From 6c4c12c950362d283ebe636d6f8571e551146295 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 1 Dec 2017 18:54:21 -0500 Subject: [PATCH 001/184] [src] Bug-fix to conceptual bug in Minimum Bayes Risk/sausage code. Thanks:@jtrmal --- src/lat/sausages.cc | 8 ++++---- src/lat/sausages.h | 24 ++++++++++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/lat/sausages.cc b/src/lat/sausages.cc index 7cb7a273b98..16a61b3f5eb 100644 --- a/src/lat/sausages.cc +++ b/src/lat/sausages.cc @@ -114,11 +114,11 @@ double MinimumBayesRisk::EditDistance(int32 N, int32 Q, for (int32 q = 0; q <= Q; q++) { if (q == 0) { alpha_dash_arc(q) = // line 15. - alpha_dash(s_a, q) + l(w_a, 0) + delta(); + alpha_dash(s_a, q) + l(w_a, 0, true); } else { // a1,a2,a3 are the 3 parts of min expression of line 17. int32 r_q = r(q); double a1 = alpha_dash(s_a, q-1) + l(w_a, r_q), - a2 = alpha_dash(s_a, q) + l(w_a, 0) + delta(), + a2 = alpha_dash(s_a, q) + l(w_a, 0, true), a3 = alpha_dash_arc(q-1) + l(0, r_q); alpha_dash_arc(q) = std::min(a1, std::min(a2, a3)); } @@ -166,11 +166,11 @@ void MinimumBayesRisk::AccStats() { const Arc &arc = arcs_[pre_[n][i]]; int32 s_a = arc.start_node, w_a = arc.word; BaseFloat p_a = arc.loglike; - alpha_dash_arc(0) = alpha_dash(s_a, 0) + l(w_a, 0) + delta(); // line 14. + alpha_dash_arc(0) = alpha_dash(s_a, 0) + l(w_a, 0, true); // line 14. for (int32 q = 1; q <= Q; q++) { // this loop == lines 15-18. int32 r_q = r(q); double a1 = alpha_dash(s_a, q-1) + l(w_a, r_q), - a2 = alpha_dash(s_a, q) + l(w_a, 0) + delta(), + a2 = alpha_dash(s_a, q) + l(w_a, 0, true), a3 = alpha_dash_arc(q-1) + l(0, r_q); if (a1 <= a2) { if (a1 <= a3) { b_arc[q] = 1; alpha_dash_arc(q) = a1; } diff --git a/src/lat/sausages.h b/src/lat/sausages.h index a6af91cc12f..9dab0b68713 100644 --- a/src/lat/sausages.h +++ b/src/lat/sausages.h @@ -128,8 +128,18 @@ class MinimumBayesRisk { /// Minimum-Bayes-Risk Decode. Top-level algorithm. Figure 6 of the paper. 
void MbrDecode(); - /// The basic edit-distance function l(a,b), as in the paper. - inline double l(int32 a, int32 b) { return (a == b ? 0.0 : 1.0); } + /// Without the 'penalize' argument this gives us the basic edit-distance + /// function l(a,b), as in the paper. + /// With the 'penalize' argument it can be interpreted as the edit distance + /// plus the 'delta' from the paper, except that we make a kind of conceptual + /// bug-fix and only apply the delta if the edit-distance was not already + /// zero. This bug-fix was necessary in order to force all the stats to show + /// up, that should show up, and applying the bug-fix makes the sausage stats + /// significantly less sparse. + inline double l(int32 a, int32 b, bool penalize = false) { + if (a == b) return 0.0; + else return (penalize ? 1.0 + delta() : 1.0); + } /// returns r_q, in one-based indexing, as in the paper. inline int32 r(int32 q) { return R_[q-1]; } @@ -151,8 +161,14 @@ class MinimumBayesRisk { // epsilon (0). (But if no words in vec, just one epsilon) static void NormalizeEps(std::vector *vec); - static inline BaseFloat delta() { return 1.0e-05; } // A constant - // used in the algorithm. + // delta() is a constant used in the algorithm, which penalizes + // the use of certain epsilon transitions in the edit-distance which would cause + // words not to show up in the accumulated edit-distance statistics. + // There has been a conceptual bug-fix versus the way it was presented in + // the paper: we now add delta only if the edit-distance was not already + // zero. + static inline BaseFloat delta() { return 1.0e-05; } + /// Function used to increment map. static inline void AddToMap(int32 i, double d, std::map *gamma) { From 38e63553519a901e391419534c31d189300bb8de Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 2 Dec 2017 15:23:32 -0500 Subject: [PATCH 002/184] [scripts] Support batchnorm after LSTM layers. 
--- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 84 ++++++++++++++----- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 4 + 2 files changed, 66 insertions(+), 22 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 9743d0100b9..96f63537a55 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -103,7 +103,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -113,7 +113,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -258,6 +258,8 @@ def generate_lstm_config(self): # This class is for lines like # 'lstmp-layer name=lstm1 input=[-1] delay=-3' +# (you can also use the name 'lstmp-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph with output projections. It can also generate # outputs without projection, but you could use the XconfigLstmLayer for this # simple LSTM. @@ -292,7 +294,9 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "lstmp-layer" + # lstmp-batchnorm-layer is like lstmp-layer but followed by a batchnorm + # component. 
+ assert first_token in ["lstmp-layer", "lstmp-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -353,7 +357,8 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'rp_t' + node_name = ( 'rp_t_batchnorm' if self.layer_type == 'lstmp-batchnorm-layer' + else 'rp_t' ) if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -375,7 +380,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -385,7 +390,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -542,18 +547,27 @@ def generate_lstm_config(self): configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}" "".format(name, rec_proj_dim, bptrunc_str)) - configs.append("# r_t and p_t : rp_t will be the output") + configs.append("# r_t and p_t : rp_t will be the output (if we're not doing batchnorm)") configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t" "".format(name)) configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 " "dim={1}".format(name, rec_proj_dim)) configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + if self.layer_type == "lstmp-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. 
+ configs.append("component name={0}.rp_t_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.rp_t_batchnorm component={0}.rp_t_batchnorm " + "input={0}.rp_t".format(name)) + return configs # This class is for lines like # 'fast-lstm-layer name=lstm1 input=[-1] delay=-3' +# (you can also use the name 'fast-lstm-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph without output projections. # Unlike 'lstm-layer', the core nonlinearities of the LSTM are done in a special-purpose # component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined @@ -586,7 +600,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstm-layer" + assert first_token in ["fast-lstm-layer", "fast-lstm-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -626,7 +640,8 @@ def auxiliary_outputs(self): return ['c'] def output_name(self, auxiliary_output = None): - node_name = 'm' + node_name = ('m_batchnorm' if self.layer_type == 'fast-lstm-batchnorm-layer' + else 'm') if auxiliary_output is not None: if auxiliary_output == 'c': node_name = 'c' @@ -647,7 +662,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -657,7 +672,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -723,7 +738,13 @@ def 
generate_lstm_config(self): configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + + if self.layer_type == "fast-lstm-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -731,6 +752,8 @@ def generate_lstm_config(self): # This class is for lines like # 'fast-lstmb-layer name=lstm1 input=[-1] delay=-3' +# (you can also call it 'fast-lstmb-batchnorm-layer' if you want it to end +# in a batchnorm component). # It's like fast-lstm-layer but with a bottleneck (like an SVD) in the main parameter matrix # of the LSTM (W_all, which combines all the full-rank projections of the LSTM): we divide # it into two matrices, with batch-norm in between to stabilize the training. 
@@ -763,7 +786,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmbLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstmb-layer" + assert first_token in [ 'fast-lstmb-layer', 'fast-lstmb-batchnorm-layer' ] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -807,7 +830,8 @@ def auxiliary_outputs(self): return ['c'] def output_name(self, auxiliary_output = None): - node_name = 'm' + node_name = ('m_batchnorm' if self.layer_type == 'fast-lstmb-batchnorm-layer' + else 'm') if auxiliary_output is not None: if auxiliary_output == 'c': node_name = 'c' @@ -828,7 +852,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -838,7 +862,7 @@ def get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name @@ -923,6 +947,13 @@ def generate_lstm_config(self): configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + + if self.layer_type == "fast-lstmb-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. 
+ configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs @@ -933,6 +964,8 @@ def generate_lstm_config(self): # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3' # or: # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3 cell-dim=1024 recurrent-projection-dim=512 non-recurrent-projection-dim=512' +# (you can also use the name 'fast-lstmp-batchnorm-layer' if you want it to be followed +# by batchnorm). # It generates an LSTM sub-graph with output projections (i.e. a projected LSTM, AKA LSTMP). # Unlike 'lstmp-layer', the core nonlinearities of the LSTM are done in a special-purpose # component (LstmNonlinearityComponent), and most of the affine parts of the LSTM are combined @@ -968,7 +1001,7 @@ def generate_lstm_config(self): # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmpLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == "fast-lstmp-layer" + assert first_token in ['fast-lstmp-layer', 'fast-lstmp-batchnorm-layer'] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -1026,7 +1059,8 @@ def auxiliary_outputs(self): return ['c_t'] def output_name(self, auxiliary_output = None): - node_name = 'rp' + node_name = ('rp_batchnorm' if self.layer_type == 'fast-lstmp-batchnorm-layer' + else 'rp') if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output @@ -1048,7 +1082,7 @@ def output_dim(self, auxiliary_output = None): def get_full_config(self): ans = [] - config_lines = self.generate_lstm_config() + config_lines = self._generate_lstm_config() for line in config_lines: for config_name in ['ref', 'final']: @@ -1058,8 +1092,7 @@ def 
get_full_config(self): return ans # convenience function to generate the LSTM config - def generate_lstm_config(self): - + def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness @@ -1145,7 +1178,8 @@ def generate_lstm_config(self): "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " "dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("# {0}.rp is the output node of this layer:".format(name)) + configs.append("# {0}.rp is the output node of this layer (if we're not " + "including batchnorm)".format(name)) configs.append("component-node name={0}.rp component={0}.W_rp input={0}.m".format(name)) configs.append("dim-range-node name={0}.r input-node={0}.rp dim-offset=0 " "dim={1}".format(name, rec_proj_dim)) @@ -1158,6 +1192,12 @@ def generate_lstm_config(self): "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + if self.layer_type == "fast-lstmp-batchnorm-layer": + # Add the batchnorm component, if requested to include batchnorm. 
+ configs.append("component name={0}.rp_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim)) + configs.append("component-node name={0}.rp_batchnorm component={0}.rp_batchnorm " + "input={0}.rp".format(name)) configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 99424cd535e..c41b1092da1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -34,9 +34,13 @@ 'affine-layer' : xlayers.XconfigAffineLayer, 'lstm-layer' : xlayers.XconfigLstmLayer, 'lstmp-layer' : xlayers.XconfigLstmpLayer, + 'lstmp-batchnorm-layer' : xlayers.XconfigLstmpLayer, 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, + 'fast-lstm-batchnorm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, + 'fast-lstmp-batchnorm-layer' : xlayers.XconfigFastLstmpLayer, 'fast-lstmb-layer' : xlayers.XconfigFastLstmbLayer, + 'fast-lstmb-batchnorm-layer' : xlayers.XconfigFastLstmbLayer, 'stats-layer': xlayers.XconfigStatsLayer, 'relu-conv-layer': xlayers.XconfigConvLayer, 'conv-layer': xlayers.XconfigConvLayer, From 1a1bc73ebd5713fba3848d9e2b831e693262acd3 Mon Sep 17 00:00:00 2001 From: freewym Date: Fri, 8 Dec 2017 23:31:52 -0500 Subject: [PATCH 003/184] model combine by averaging --- src/chainbin/nnet3-chain-combine.cc | 90 ++++++++++++++++++++++----- src/nnet3bin/nnet3-combine.cc | 95 +++++++++++++++++++++++------ 2 files changed, 153 insertions(+), 32 deletions(-) diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 3c44e6b904c..9100fbe3132 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -1,6 +1,7 @@ // chainbin/nnet3-chain-combine.cc // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2017 Yiming Wang // See ../../COPYING for clarification 
regarding multiple authors // @@ -19,7 +20,56 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "nnet3/nnet-chain-combine.h" +#include "nnet3/nnet-utils.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-chain-diagnostics.h" + + +namespace kaldi { +namespace nnet3 { + +double ComputeObjf(const std::vector &egs, + NnetChainComputeProb *prob_computer) { + prob_computer->Reset(); + std::vector::const_iterator iter = egs.begin(), + end = egs.end(); + for (; iter != end; ++iter) + prob_computer->Compute(*iter); + const ChainObjectiveInfo *objf_info = + prob_computer->GetObjective("output"); + if (objf_info == NULL) + KALDI_ERR << "Error getting objective info (unsuitable egs?)"; + KALDI_ASSERT(objf_info->tot_weight > 0.0); + // we prefer to deal with normalized objective functions. + return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; +} + +// Note: the object that prob_computer.nnet_ refers to should be +// *moving_average_nnet. +double UpdateNnetMovingAverageAndComputeObjf(int32 num_models, + const std::vector &egs, + const Nnet &nnet, Nnet *moving_average_nnet, + NnetChainComputeProb *prob_computer) { + int32 num_params = NumParameters(nnet); + KALDI_ASSERT(num_params == NumParameters(*moving_average_nnet)); + Vector nnet_params(num_params, kUndefined), + moving_average_nnet_params(num_params, kUndefined); + VectorizeNnet(nnet, &nnet_params); + VectorizeNnet(*moving_average_nnet, &moving_average_nnet_params); + moving_average_nnet_params.Scale((num_models - 1.0) / num_models); + moving_average_nnet_params.AddVec(1.0 / num_models, nnet_params); + + BaseFloat sum = moving_average_nnet_params.Sum(); + // inf/nan parameters->return -inf objective. 
+ if (!(sum == sum && sum - sum == 0)) + return -std::numeric_limits::infinity(); + + UnVectorizeNnet(moving_average_nnet_params, moving_average_nnet); + return ComputeObjf(egs, prob_computer); +} + +} +} int main(int argc, char *argv[]) { @@ -30,9 +80,11 @@ int main(int argc, char *argv[]) { typedef kaldi::int64 int64; const char *usage = - "Using a subset of training or held-out nnet3+chain examples, compute an\n" - "optimal combination of anumber of nnet3 neural nets by maximizing the\n" - "'chain' objective function. See documentation of options for more details.\n" + "Using a subset of training or held-out nnet3+chain examples, compute\n" + "the average over the first n nnet models where we maximize the\n" + "'chain' objective function for n. Note that the order of models has\n" + "been reversed before feeding into this binary. So we are actually\n" + "combining last n models.\n" "Inputs and outputs are nnet3 raw nnets.\n" "\n" "Usage: nnet3-chain-combine [options] ... \n" @@ -44,7 +96,6 @@ int main(int argc, char *argv[]) { bool batchnorm_test_mode = false, dropout_test_mode = true; std::string use_gpu = "yes"; - NnetCombineConfig combine_config; chain::ChainTrainingOptions chain_config; ParseOptions po(usage); @@ -57,7 +108,6 @@ int main(int argc, char *argv[]) { "If true, set test-mode to true on any DropoutComponents and " "DropoutMaskComponents."); - combine_config.Register(&po); chain_config.Register(&po); po.Read(argc, argv); @@ -83,6 +133,10 @@ int main(int argc, char *argv[]) { Nnet nnet; ReadKaldiObject(raw_nnet_rxfilename, &nnet); + Nnet moving_average_nnet(nnet), best_nnet(nnet); + NnetComputeProbOptions compute_prob_opts; + NnetChainComputeProb *prob_computer = new NnetChainComputeProb( + compute_prob_opts, chain_config, den_fst, moving_average_nnet); if (batchnorm_test_mode) SetBatchnormTestMode(true, &nnet); @@ -102,28 +156,36 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(!egs.empty()); } + int32 best_n = 1; + double best_objf = 
ComputeObjf(egs, prob_computer); + KALDI_LOG << "objective function using the last model is " << best_objf; int32 num_nnets = po.NumArgs() - 3; - NnetChainCombiner combiner(combine_config, chain_config, - num_nnets, egs, den_fst, nnet); for (int32 n = 1; n < num_nnets; n++) { std::string this_nnet_rxfilename = po.GetArg(n + 2); ReadKaldiObject(this_nnet_rxfilename, &nnet); - combiner.AcceptNnet(nnet); + double objf = UpdateNnetMovingAverageAndComputeObjf(n + 1, egs, nnet, + &moving_average_nnet, prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_n = n + 1; + } } - combiner.Combine(); - - nnet = combiner.GetNnet(); if (HasBatchnorm(nnet)) - RecomputeStats(egs, chain_config, den_fst, &nnet); + RecomputeStats(egs, chain_config, den_fst, &best_nnet); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); #endif - WriteKaldiObject(nnet, nnet_wxfilename, binary_write); + WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); + KALDI_LOG << "Using the model averaged over last " << best_n + << " models, objective function is " << best_objf; KALDI_LOG << "Finished combining neural nets, wrote model to " << nnet_wxfilename; diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index 128a9642ec4..e4181d5133b 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -1,6 +1,7 @@ // nnet3bin/nnet3-combine.cc // Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2017 Yiming Wang // See ../../COPYING for clarification regarding multiple authors // @@ -19,8 +20,54 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" -#include "nnet3/nnet-combine.h" +#include "nnet3/nnet-utils.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/nnet-diagnostics.h" + + +namespace kaldi { +namespace nnet3 { + +double ComputeObjf(const std::vector &egs, + NnetComputeProb 
*prob_computer) { + prob_computer->Reset(); + std::vector::const_iterator iter = egs.begin(), + end = egs.end(); + for (; iter != end; ++iter) + prob_computer->Compute(*iter); + double tot_weights, + tot_objf = prob_computer->GetTotalObjective(&tot_weights); + KALDI_ASSERT(tot_weights > 0.0); + // we prefer to deal with normalized objective functions. + return tot_objf / tot_weights; +} + +// Note: the object that prob_computer.nnet_ refers to should be +// *moving_average_nnet. +double UpdateNnetMovingAverageAndComputeObjf(int32 num_models, + const std::vector &egs, + const Nnet &nnet, Nnet *moving_average_nnet, + NnetComputeProb *prob_computer) { + int32 num_params = NumParameters(nnet); + KALDI_ASSERT(num_params == NumParameters(*moving_average_nnet)); + Vector nnet_params(num_params, kUndefined), + moving_average_nnet_params(num_params, kUndefined); + VectorizeNnet(nnet, &nnet_params); + VectorizeNnet(*moving_average_nnet, &moving_average_nnet_params); + moving_average_nnet_params.Scale((num_models - 1.0) / num_models); + moving_average_nnet_params.AddVec(1.0 / num_models, nnet_params); + + BaseFloat sum = moving_average_nnet_params.Sum(); + // inf/nan parameters->return -inf objective. + if (!(sum == sum && sum - sum == 0)) + return -std::numeric_limits::infinity(); + + UnVectorizeNnet(moving_average_nnet_params, moving_average_nnet); + return ComputeObjf(egs, prob_computer); +} +} +} int main(int argc, char *argv[]) { try { @@ -30,11 +77,13 @@ int main(int argc, char *argv[]) { typedef kaldi::int64 int64; const char *usage = - "Using a subset of training or held-out examples, compute an optimal combination of a\n" - "number of nnet3 neural nets by maximizing the objective function. See documentation of\n" - "options for more details. Inputs and outputs are 'raw' nnets.\n" + "Using a subset of training or held-out examples, compute the average\n" + "over the first n nnet3 models where we maxize the objective function\n" + "for n. 
Note that the order of models has been reversed before\n" + "feeding into this binary. So we are actually combining last n models.\n" + "Inputs and outputs are 'raw' nnets.\n" "\n" - "Usage: nnet3-combine [options] ... \n" + "Usage: nnet3-combine [options] ... \n" "\n" "e.g.:\n" " nnet3-combine 1.1.raw 1.2.raw 1.3.raw ark:valid.egs 2.raw\n"; @@ -43,7 +92,6 @@ int main(int argc, char *argv[]) { bool batchnorm_test_mode = false, dropout_test_mode = true; std::string use_gpu = "yes"; - NnetCombineConfig combine_config; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); @@ -55,8 +103,6 @@ int main(int argc, char *argv[]) { po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); - combine_config.Register(&po); - po.Read(argc, argv); if (po.NumArgs() < 3) { @@ -75,6 +121,10 @@ int main(int argc, char *argv[]) { Nnet nnet; ReadKaldiObject(nnet_rxfilename, &nnet); + Nnet moving_average_nnet(nnet), best_nnet(nnet); + NnetComputeProbOptions compute_prob_opts; + NnetComputeProb *prob_computer = new NnetComputeProb(compute_prob_opts, + moving_average_nnet); if (batchnorm_test_mode) SetBatchnormTestMode(true, &nnet); @@ -94,24 +144,33 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(!egs.empty()); } + int32 best_n = 1; + double best_objf = ComputeObjf(egs, prob_computer); + KALDI_LOG << "objective function using the last model is " << best_objf; - int32 num_nnets = po.NumArgs() - 2; - if (num_nnets > 1 || !combine_config.enforce_sum_to_one) { - NnetCombiner combiner(combine_config, num_nnets, egs, nnet); - - for (int32 n = 1; n < num_nnets; n++) { + int32 num_inputs = po.NumArgs() - 2; + if (num_inputs > 1) { + for (int32 n = 1; n < num_inputs; n++) { ReadKaldiObject(po.GetArg(1 + n), &nnet); - combiner.AcceptNnet(nnet); + double objf = UpdateNnetMovingAverageAndComputeObjf(n + 1, egs, nnet, + &moving_average_nnet, prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, 
objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_n = n + 1; + } } - combiner.Combine(); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); #endif - nnet = combiner.GetNnet(); if (HasBatchnorm(nnet)) - RecomputeStats(egs, &nnet); - WriteKaldiObject(nnet, nnet_wxfilename, binary_write); + RecomputeStats(egs, &best_nnet); + WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); + KALDI_LOG << "Using the model averaged over last " << best_n + << " models, objective function is " << best_objf; } else { KALDI_LOG << "Copying the single input model directly to the output, " << "without any combination."; From f41b0a2f3a6de583a92becd6e790364f0836edaa Mon Sep 17 00:00:00 2001 From: freewym Date: Sun, 10 Dec 2017 15:38:23 -0500 Subject: [PATCH 004/184] fix --- src/chainbin/nnet3-chain-combine.cc | 5 ++--- src/nnet3bin/nnet3-combine.cc | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 9100fbe3132..7dece5cb070 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -175,6 +175,8 @@ int main(int argc, char *argv[]) { best_n = n + 1; } } + KALDI_LOG << "Using the model averaged over last " << best_n + << " models, objective function is " << best_objf; if (HasBatchnorm(nnet)) RecomputeStats(egs, chain_config, den_fst, &best_nnet); @@ -184,9 +186,6 @@ int main(int argc, char *argv[]) { #endif WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); - KALDI_LOG << "Using the model averaged over last " << best_n - << " models, objective function is " << best_objf; - KALDI_LOG << "Finished combining neural nets, wrote model to " << nnet_wxfilename; } catch(const std::exception &e) { diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index e4181d5133b..5d67715a228 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ 
b/src/nnet3bin/nnet3-combine.cc @@ -162,6 +162,8 @@ int main(int argc, char *argv[]) { best_n = n + 1; } } + KALDI_LOG << "Using the model averaged over last " << best_n + << " models, objective function is " << best_objf; #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); @@ -169,8 +171,6 @@ int main(int argc, char *argv[]) { if (HasBatchnorm(nnet)) RecomputeStats(egs, &best_nnet); WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); - KALDI_LOG << "Using the model averaged over last " << best_n - << " models, objective function is " << best_objf; } else { KALDI_LOG << "Copying the single input model directly to the output, " << "without any combination."; From 441c5df446de7a5944dbdeb0bf9c09fe30568e88 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 6 Dec 2017 17:12:30 -0500 Subject: [PATCH 005/184] [src] Nnet3 changes: add MemoryNormComponent; move BatchNorm,Normalize component code. --- src/nnet3/Makefile | 2 +- src/nnet3/nnet-component-itf.cc | 3 + src/nnet3/nnet-component-itf.h | 15 +- src/nnet3/nnet-normalize-component.cc | 1217 +++++++++++++++++++++++++ src/nnet3/nnet-normalize-component.h | 541 +++++++++++ src/nnet3/nnet-parse.cc | 7 +- src/nnet3/nnet-parse.h | 4 +- src/nnet3/nnet-simple-component.cc | 656 ------------- src/nnet3/nnet-simple-component.h | 256 +----- src/nnet3/nnet-test-utils.cc | 10 + src/nnet3/nnet-utils.cc | 1 + 11 files changed, 1794 insertions(+), 918 deletions(-) create mode 100644 src/nnet3/nnet-normalize-component.cc create mode 100644 src/nnet3/nnet-normalize-component.h diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 3236c52d60f..51dade98831 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -15,7 +15,7 @@ TESTFILES = natural-gradient-online-test nnet-graph-test \ nnet-common-test convolution-test attention-test OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ - nnet-simple-component.o \ + nnet-simple-component.o nnet-normalize-component.o \ nnet-general-component.o nnet-parse.o 
natural-gradient-online.o \ nnet-descriptor.o nnet-optimize.o nnet-computation.o \ nnet-computation-graph.o nnet-graph.o am-nnet-simple.o \ diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 82010fea58d..f83ad26f375 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -23,6 +23,7 @@ #include #include "nnet3/nnet-component-itf.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-normalize-component.h" #include "nnet3/nnet-general-component.h" #include "nnet3/nnet-convolutional-component.h" #include "nnet3/nnet-attention-component.h" @@ -163,6 +164,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new LstmNonlinearityComponent(); } else if (component_type == "BatchNormComponent") { ans = new BatchNormComponent(); + } else if (component_type == "MemoryNormComponent") { + ans = new MemoryNormComponent(); } else if (component_type == "TimeHeightConvolutionComponent") { ans = new TimeHeightConvolutionComponent(); } else if (component_type == "RestrictedAttentionComponent") { diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 62e09cee80f..565a7f25e74 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -351,20 +351,23 @@ class Component { /// although most components will have much more info. virtual std::string Info() const; - /// This virtual function when called by - // -- an UpdatableComponent scales the parameters + /// This virtual function when called on + /// -- an UpdatableComponent scales the parameters /// by "scale" when called by an UpdatableComponent. - // -- a Nonlinear component (or another component that - /// stores stats, like BatchNormComponent-- it relates + /// -- a Nonlinear component (or another component that + /// stores stats, like BatchNormComponent)-- it relates /// to scaling activation stats, not parameters. + /// Otherwise it will normally do nothing. 
virtual void Scale(BaseFloat scale) {}; /// This virtual function when called by /// -- an UpdatableComponent adds the parameters of /// another updatable component, times some constant, to the current /// parameters. - /// -- a NonlinearComponent it relates to adding stats - /// Otherwise it should do nothing. + /// -- a NonlinearComponent (or another component that stores + /// stats, like BatchNormComponent)-- it relates to adding + /// stats. + /// Otherwise it will normally do nothing. virtual void Add(BaseFloat alpha, const Component &other) {}; /// This virtual function only needs to be overwritten by Components that diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc new file mode 100644 index 00000000000..ac3817adfbe --- /dev/null +++ b/src/nnet3/nnet-normalize-component.cc @@ -0,0 +1,1217 @@ +// nnet3/nnet-normalize-component.cc + +// Copyright 2015-2017 Johns Hopkins University (author: Daniel Povey) +// 2015 Guoguo Chen +// 2015 Daniel Galvez + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include "nnet3/nnet-normalize-component.h" +#include "nnet3/nnet-parse.h" +#include "cudamatrix/cu-math.h" + +namespace kaldi { +namespace nnet3 { + +const BaseFloat NormalizeComponent::kSquaredNormFloor = + pow(2.0, NormalizeComponent::kExpSquaredNormFloor); + +NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): + input_dim_(other.input_dim_), block_dim_(other.block_dim_), + target_rms_(other.target_rms_), + add_log_stddev_(other.add_log_stddev_) { } + +void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { + input_dim_ = 0; + add_log_stddev_ = false; + target_rms_ = 1.0; + bool ok = cfl->GetValue("dim", &input_dim_) || + cfl->GetValue("input-dim", &input_dim_); + block_dim_ = input_dim_; + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("add-log-stddev", &add_log_stddev_); + if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 || + block_dim_ <= 0 || input_dim_ % block_dim_ != 0) + KALDI_ERR << "Invalid initializer for layer of type " + << Type() << ": \"" << cfl->WholeLine() << "\""; +} + +void NormalizeComponent::Read(std::istream &is, bool binary) { + std::string token; + ReadToken(is, binary, &token); + if (token == "") { + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == "" || token == ""); + ReadBasicType(is, binary, &input_dim_); // Read dimension. + ReadToken(is, binary, &token); + if (token == "") { + ReadBasicType(is, binary, &block_dim_); + ReadToken(is, binary, &token); + } else { + block_dim_ = input_dim_; + } + // read target_rms_ if it is available. + if (token == "") { + ReadBasicType(is, binary, &target_rms_); + ReadToken(is, binary, &token); + } + // Read add_log_stddev_ token, if it is available. + if (token == "") { + ReadBasicType(is, binary, &add_log_stddev_); + ReadToken(is, binary, &token); + } + if (token == "") { + // back-compatibility code. 
+ CuVector temp; + temp.Read(is, binary); + ExpectToken(is, binary, ""); + temp.Read(is, binary); + ExpectToken(is, binary, ""); + double count; + ReadBasicType(is, binary, &count); + ReadToken(is, binary, &token); + } + KALDI_ASSERT(token == ""); +} + +void NormalizeComponent::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, input_dim_); + if (block_dim_ != input_dim_) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + } + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, add_log_stddev_); + WriteToken(os, binary, ""); +} + +std::string NormalizeComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", input-dim=" << InputDim() + << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_ + << ", add-log-stddev=" << std::boolalpha << add_log_stddev_; + if (block_dim_ != input_dim_) + stream << ", block-dim=" << block_dim_; + return stream.str(); +} + +// The output y_i = scale * x_i, +// and we want to RMS value of the y_i to equal target_rms, +// so y^t y = D * target_rms^2 (if y is one row of the input). +// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). +// there is also flooring involved, to avoid division-by-zero +// problems. It's important for the backprop, that the floor's +// square root is exactly representable as float. +// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) +// is an extra dimension of the output. 
+void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() && + in.NumRows() == out->NumRows()); + if (block_dim_ != input_dim_) { + int32 num_blocks = input_dim_ / block_dim_, + new_num_rows = in.NumRows() * num_blocks, + output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); + KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + CuSubMatrix in_reshaped(in.Data(), new_num_rows, + block_dim_, block_dim_), + out_reshaped(out->Data(), new_num_rows, + output_block_dim, output_block_dim); + cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_, + &out_reshaped); + } else { + cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); + } + return NULL; +} + +/* + A note on the derivative of NormalizeComponent... + let both row_in and row_out be vectors of dimension D. + Let p = row_in^T row_in / (D * target_rms^2), and let + f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as: + row_out = f row_in. + Suppose we have a quantity deriv_out which is the derivative + of the objective function w.r.t. row_out. We want to compute + deriv_in which is the derivative of the objective function w.r.t. + row_in. Let the objective function be F. One term is obvious: we have + deriv_in = f deriv_out + .... + next we have to take into account the derivative that gets back-propagated + through f. Obviously, dF/df = deriv_out^T row_in. + And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3), + and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued]. + So this term in dF/d(row_in) equals: + dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in + So + deriv_in = f deriv_out + (f == 1.0 ? 
0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in + + if add_log_stddev_ true, the deriv_in has another term as + dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x) +*/ +void NormalizeComponent::Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const { + if (!in_deriv) + return; + if (block_dim_ != input_dim_) { + int32 num_blocks = input_dim_ / block_dim_, + new_num_rows = in_value.NumRows() * num_blocks, + output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); + KALDI_ASSERT(in_value.Stride() == in_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + CuSubMatrix in_value_reshaped(in_value.Data(), new_num_rows, + block_dim_, block_dim_), + out_deriv_reshaped(out_deriv.Data(), new_num_rows, + output_block_dim, output_block_dim), + in_deriv_reshaped(in_deriv->Data(), new_num_rows, + block_dim_, block_dim_); + cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_, + add_log_stddev_, &in_deriv_reshaped); + } else { + cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_, + in_deriv); + } +} + +void BatchNormComponent::ComputeDerived() { + if (!test_mode_) { + offset_.Resize(0); + scale_.Resize(0); + return; + } + + if (count_ == 0.0) { + KALDI_WARN << "Test-mode is set but there is no data count. " + "Creating random counts. This only makes sense " + "in unit-tests (or compute_prob_*.0.log). If you see this " + "elsewhere, something is very wrong."; + count_ = 1.0; + stats_sum_.SetRandn(); + stats_sumsq_.SetRandn(); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + } + + offset_.Resize(block_dim_); + scale_.Resize(block_dim_); + offset_.CopyFromVec(stats_sum_); + offset_.Scale(-1.0 / count_); + // now offset_ is -mean. 
+ scale_.CopyFromVec(stats_sumsq_); + scale_.Scale(1.0 / count_); + scale_.AddVecVec(-1.0, offset_, offset_, 1.0); + // now scale_ is variance. + // Mathematically the ApplyFloor statement should be a no-op; this is in case + // of numerical roundoff. + scale_.ApplyFloor(0.0); + scale_.Add(epsilon_); + scale_.ApplyPow(-0.5); + // now scale_ = min(variance, epsilon)^{-0.5}. + // next, multiply by the target RMS (normally 1.0). + scale_.Scale(target_rms_); + offset_.MulElements(scale_); + // now offset_ is -(scale*mean). +} + +void BatchNormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; + ComputeDerived(); +} + +void BatchNormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0.0 && target_rms_ > 0.0); +} + +BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), + target_rms_(other.target_rms_), test_mode_(other.test_mode_), + count_(other.count_), stats_sum_(other.stats_sum_), + stats_sumsq_(other.stats_sumsq_) { + ComputeDerived(); + Check(); +} + + +std::string BatchNormComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ + << ", count=" << count_ + << ", test-mode=" << (test_mode_ ? "true" : "false"); + if (count_ > 0) { + Vector mean(stats_sum_), var(stats_sumsq_); + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + // subtract mean^2 from var. + var.AddVecVec(-1.0, mean, mean, 1.0); + var.ApplyFloor(0.0); + var.ApplyPow(0.5); // make it the stddev. 
+ stream << ", data-mean=" << SummarizeVector(mean) + << ", data-stddev=" << SummarizeVector(var); + } + return stream.str(); +} + +void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + epsilon_ = 1.0e-03; + target_rms_ = 1.0; + test_mode_ = false; + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("test-mode", &test_mode_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0 && target_rms_ > 0)) + KALDI_ERR << "Invalid configuration in BatchNormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + count_ = 0; + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + if (test_mode_) { + ComputeDerived(); + } +} + + + +/* + BATCH_NORM_MATH + + This comment describes the equations involved in batch normalization, and + derives the forward and back-propagation. + + This is all dimension-by-dimension, so we just imagine the inputs + are scalars x(i), for i=0 .. n-1. + + FORWARD PASS: + + Define xsum = sum_i x(i) + x2sum = sum_i x(i)^2 + mean = xsum / n + var = x2sum / n - (mean*mean) + scale = sqrt(var + epsilon)^{-0.5} + offset = -mean * scale + + y(i) = scale * x(i) + offset + + Most of the rest of this comment derives how to compute the derivatives. If + you just want the formulas, please skip to the string 'BACKWARD PASS' below. + + We'll use a notation where an apostrophe on something means (the derivative of + the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. + We are given y'(i). 
Propagating the derivatives backward: + offset' = sum_i y'(i) + scale' = (sum_i y'(i) * x(i)) - offset' * mean + var' = scale' * -0.5 * sqrt(var + epsilon)^{-1.5} + = -0.5 * scale' * scale^3 + mean' = -offset' * scale - 2 * mean * var' + xsum' = mean' / n + x2sum' = var' / n + + So the derivatives propagated back to the original data are: + x'(i) = y'(i) * scale + xsum' + x(i) * x2sum' + + The above is quite complicated to compute, but we can use some invariances + to work out a simpler way to compute the derivatives. + + Firstly, note that x'(i) is of the form: + + x'(i) = y'(i) * scale + [affine function of x(i)]. + + [it's a 1-d affine function, i.e. offset and scale]. + This has the same functional form as: + + x'(i) = y'(i) * scale + [affine function of y(i)]. + + since y(i) is an affine function of x(i) with nonzero scale. + Because the output is invariant to shifts in the input, sum_i x'(i) + will be zero. This is sufficient to determine the bias + term in the affine function. [Note: the scale on y(i) doesn't + come into it because the y(i) sum to zero]. The offset + will just be (sum_i y'(i) * scale / n); this makes the sum of x'(i) zero. + So let's write it as + + x'(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha y(i). + + and it will be convenient to define: + + x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale + + which is just y'(i) with mean subtraction, scaled according to + the scale used in the normalization. So write + + x'(i) = x_deriv_base(i) + alpha y(i). + + The question is, what is the scale alpha. We don't actually need to + do any differentiation to figure this out. First, assume there is + no "+ epsilon" in the variance; later we'll explain why this doesn't + matter. The key to working out alpha is that the output is invariant + to scaling of the input. Assume we scale around the input's mean, + since that makes the math simpler. We can express this by the + constraint that (\sum_i x'(i) * (x(i) - avg-x)) = 0. 
This is + equivalent to the constraint that (\sum_i x'(i) y (i)) = 0, since + y(i) is x(i) - avg-x times a nonzero scale. We'll use this contraint + to determine alpha, Using the above expressionfor x(i), we can write + this constraint as: + \sum_i ( y(i) x_deriv_base(i) + alpha y(i) y(i)) = 0. + Now, since we said we'd ignore the epsilon, the output has unit variance, + so we know that \sum_i y(i) y(i) = n. + So alpha = - \sum_i y(i) x_deriv_base(i) / n. We can actually re-imagine + the epsilon term (or variance-flooring) as having been implemented by + adding a couple extra rows to the matrix with suitable values, and zero + output-deriv for those rows. If you think about it carefully you'll see that + the formula above is valid even if there is an extra term + in the variance. Anyway the correctness of the derivative will get tested + throughly by the component unit-tests. + + So to recap, here is the backprop. + + BACKWARD PASS: + + We are given y'(i), scale, and y(i). + + We compute: + x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha = - \sum_i y(i) x_deriv_base(i) / n + x'(i) = x_deriv_base(i) + alpha y(i) + */ + + + +void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. 
+ KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + + if (!test_mode_) { + // search in the comment above for FORWARD PASS to see what is being + // implemented here. + // if this takes too much time due to multiple different CUDA calls, + // we'll consider making a single kernel for some of it. + Memo *memo = new Memo; + int32 num_frames = in.NumRows(), dim = block_dim_; + memo->num_frames = num_frames; + memo->mean_uvar_scale.Resize(4, dim); + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1), + scale(memo->mean_uvar_scale, 2); + mean.AddRowSumMat(1.0 / num_frames, in, 0.0); + uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); + scale.CopyFromVec(uvar); + // by applying this scale at this point, we save a multiply later on. + BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-var_scale, mean, mean, var_scale); + // at this point, 'scale' contains just the variance [divided by target-rms^2]. + scale.ApplyFloor(0.0); + scale.Add(var_scale * epsilon_); + // Now 'scale' contains the variance floored to zero and then with epsilon + // added [both divided by target-rms^2]. + scale.ApplyPow(-0.5); + // now 'scale' is the actual scale we'll use. + + // the next command will do no work if out == in, for in-place propagation. 
+ out->CopyFromMat(in); + out->AddVecToRows(-1.0, mean, 1.0); + out->MulColsVec(scale); + return static_cast(memo); + } else { + if (offset_.Dim() != block_dim_) { + if (count_ == 0) + KALDI_ERR << "Test mode set in BatchNormComponent, but no stats."; + else // why was ComputeDerived() not called? + KALDI_ERR << "Code error in BatchNormComponent"; + } + out->CopyFromMat(in); + out->MulColsVec(scale_); + out->AddVecToRows(1.0, offset_, 1.0); + return NULL; + } +} + +void BatchNormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update, // unused + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_value, out_deriv) && + SameDim(out_value, *in_deriv) && + (out_value.NumCols() == dim_ || + out_value.NumCols() == block_dim_)); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_value.Stride() == out_value.NumCols() && + out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols), + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + // we'll never use in_value, so pass it in unchanged. + Backprop(debug_info, indexes, in_value, + out_value_reshaped, out_deriv_reshaped, + memo_in, to_update, &in_deriv_reshaped); + return; + } + + Memo *memo = static_cast(memo_in); + + if (!test_mode_) { + // search above for BACKWARD PASS for a comment describing the math. 
+ KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); + int32 num_frames = memo->num_frames; + KALDI_ASSERT(out_value.NumRows() == num_frames); + CuSubVector temp(memo->mean_uvar_scale, 3), + scale(memo->mean_uvar_scale, 2); + temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); + // the following does no work if in_deriv and out_deriv are the same matrix. + in_deriv->CopyFromMat(out_deriv); + in_deriv->AddVecToRows(1.0, temp); + in_deriv->MulColsVec(scale); + // at this point, 'in_deriv' contains: + // x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale + temp.AddDiagMatMat(-1.0 / (num_frames * target_rms_ * target_rms_), + out_value, kTrans, *in_deriv, kNoTrans, 0.0); + // now, 'temp' contains the quantity which we described + // in the math as: + // alpha = - \sum_i y(i) x_deriv_base(i) / n. + // The factor 1 / (target_rms_ * target_rms_) comes from following + // this additional scaling factor through the math. In the comment I said + // "we know that \sum_i y(i) y(i) = n". Taking target-rms into account + // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2". + in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0); + // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i). + + } else { + KALDI_ASSERT(offset_.Dim() == block_dim_); + // the next call does no work if they point to the same memory. + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scale_); + } +} + +void BatchNormComponent::StoreStats( + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo_in) { + // in test mode this component does not store stats, it doesn't provide the + // kStoresStats flag. + KALDI_ASSERT(!test_mode_); + KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); + if (out_value.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. 
+ KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + // we'll never use in_value, so just pass it in unchanged. + StoreStats(in_value, out_value_reshaped, memo_in); + return; + } + + Memo *memo = static_cast(memo_in); + KALDI_ASSERT(out_value.NumRows() == memo->num_frames); + + CuSubVector mean(memo->mean_uvar_scale, 0), + uvar(memo->mean_uvar_scale, 1); + KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0); + BaseFloat num_frames = memo->num_frames; + if (stats_sum_.Dim() != block_dim_) { + stats_sum_.Resize(block_dim_); + stats_sumsq_.Resize(block_dim_); + KALDI_ASSERT(count_ == 0); + } + count_ += num_frames; + stats_sum_.AddVec(num_frames, mean, 1.0); + stats_sumsq_.AddVec(num_frames, uvar, 1.0); +} + +void BatchNormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &epsilon_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &target_rms_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); + ExpectToken(is, binary, ""); + stats_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + stats_sumsq_.Read(is, binary); + stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); + stats_sum_.Scale(count_); + stats_sumsq_.Scale(count_); + ExpectToken(is, binary, ""); + ComputeDerived(); + Check(); +} + +void BatchNormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, 
binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, epsilon_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + CuVector mean(stats_sum_), var(stats_sumsq_); + if (count_ != 0) { + mean.Scale(1.0 / count_); + var.Scale(1.0 / count_); + var.AddVecVec(-1.0, mean, mean, 1.0); + } + WriteToken(os, binary, ""); + mean.Write(os, binary); + WriteToken(os, binary, ""); + var.Write(os, binary); + WriteToken(os, binary, ""); +} + +void BatchNormComponent::Scale(BaseFloat scale) { + if (scale == 0) { + count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } else { + count_ *= scale; + stats_sum_.Scale(scale); + stats_sumsq_.Scale(scale); + } +} + + +void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) { + const BatchNormComponent *other = + dynamic_cast(&other_in); + count_ += alpha * other->count_; + stats_sum_.AddVec(alpha, other->stats_sum_); + stats_sumsq_.AddVec(alpha, other->stats_sumsq_); + // this operation might change offset_ and scale_, so we recompute them + // in this instance (but not in Scale()). + ComputeDerived(); +} + +void BatchNormComponent::ZeroStats() { + // We only zero the stats if we're not in test mode. In test mode, this would + // be dangerous as the stats are the source for the transform, and zeroing + // them and then calling ComputeDerived() again would remove the transform + // parameters (offset_ and scale_). + if (!test_mode_) { + count_ = 0.0; + stats_sum_.SetZero(); + stats_sumsq_.SetZero(); + } +} + + + + +/** + MEMORY_NORM_MATH + + This comment describes the equations involved in 'memory-norm'. + memory-norm is like batch normalization, except instead of computing + everything on the current minibatch, we deal with decaying averages + over time, interpreted as expectations. 
We'll firm up the math later. + The idea is to obtain a form of batch-norm that is compatible with + use in recurrent neural nets. + + Everything is dimension by dimension here, so let's imagine the input and + output are one-dimensional. Any index 'i' is going to be like a frame index + or an index referring to a sample. We'll be writing down some expectations, + and we're rather cavalier with notation; these basically mean + exponentially-decaying weighted averages over time. + + The input will be x(i), and the output y(i). + + Each frame will have a weight, w(i) >= 0. (these will be part of the + decaying averages)... + + Let's define + count = \sum_i w(i) + sum = \sum_i w(i) x(i) + sumsq = \sum_i w(i) x(i)^2 + + We can compute: + mean = sum / count + var = epsilon + (sumsq / count) - (mean * mean) + scale = var^{-0.5} + + y(i) = (x(i) - mean) * scale. + + We are given the derivatives of the objective function w.r.t. the + outputs; we'll write these as y'(i) [CAUTION: this is nonstandard + notation. An apostrophe on something means the derivative of the + objective function w.r.t. that thing]. + + Over this data, with these weights, we can compute the derivative + of the objective w.r.t. the mean and the scale: + + mean' = -scale * \sum_i w(i) y'(i) + scale' = \sum_i w(i) y'(i) (x(i) - mean) + = 1/scale \sum_i w(i) y'(i) y(i) + var' = -0.5 var^{-1.5} scale' + = -0.5 var^{-1} \sum_i w(i) y'(i) y(i) + + It will be convenient to write down 'per-frame' versions of all of these + quantities, which are divided by the total count: + mean_norm' = mean' / count + scale_norm' = scale' / count + var_norm' = var' / count + (we keep the apostrophe on these quantities as it clarifies that they + are derivatives of the objective function w.r.t something). + + Now, 'var' can be written as: + var = epsilon + (1/count) \sum_i w(i) (x(i) - mean)^2 + and the following formula is more convenient to propagate the derivative + back to an x(i). 
+ Note: the following has 3 terms, which we can think of as + "direct term" (given fixed mean and scale), + "term via mean" (term that comes via derivative of the mean) + "term via scale" (term that comes via derivative of the scale) + + + x'(i) = y'(i)*scale + mean_norm' + 2 var_norm' (x(i) - mean) + = y'(i)*scale + mean_norm' + 2 var_norm' y(i) / scale + = y'(i)*scale + mean_norm' - y(i) * scale/count * \sum_i w(i) y'(i) y(i) + + I'm afraid I just pulled the above out of thin air... needs some more + derivation. The part about (x(i) - mean) can be obtained, I believe, + from computation of the derivative of the variance w.r.t. the x(i) values. + +*/ + + +void MemoryNormComponent::SetTestMode(bool test_mode) { + test_mode_ = test_mode; +} + +void MemoryNormComponent::Check() const { + KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0.0 && target_rms_ > 0.0 && + stats_count_ >= 0.0 && backward_count_ >= 0.0); + +} + +MemoryNormComponent::MemoryNormComponent(const MemoryNormComponent &other): + dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), + target_rms_(other.target_rms_), + include_indirect_derivative_(other.include_indirect_derivative_), + test_mode_(other.test_mode_), + stats_count_(other.stats_count_), backward_count_(other.backward_count_), + data_(other.data_) { + Check(); +} + + +std::string MemoryNormComponent::Info() const { + std::ostringstream stream; + stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ + << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ + << ", include-indirect-derivative=" + << (include_indirect_derivative_ ? "true" : "false") + << ", stats-count=" << stats_count_ << ", backward-count=" + << backward_count_ + << ", test-mode=" << (test_mode_ ? 
"true" : "false"); + if (stats_count_ > 0.0) { + CuSubVector x_mean(data_, 0), + y_deriv(data_, 2), y_deriv_y(data_, 3), + scale(data_, 4), x_deriv(data_, 5), + scale_deriv(data_, 6); + if (stats_count_ > 0.0) + stream << ", x-mean=" << SummarizeVector(x_mean) + << ", scale=" << SummarizeVector(scale); + if (backward_count_ > 0.0) + stream << ", y-deriv=" << SummarizeVector(y_deriv) + << ", y-deriv-y=" << SummarizeVector(y_deriv_y) + << ", x-deriv=" << SummarizeVector(x_deriv) + << ", scale-deriv=" << SummarizeVector(scale_deriv); + } + return stream.str(); +} + +void MemoryNormComponent::InitFromConfig(ConfigLine *cfl) { + dim_ = -1; + block_dim_ = -1; + epsilon_ = 1.0e-03; + target_rms_ = 1.0; + include_indirect_derivative_ = true; + test_mode_ = false; + + bool ok = cfl->GetValue("dim", &dim_); + cfl->GetValue("block-dim", &block_dim_); + cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("target-rms", &target_rms_); + cfl->GetValue("include-indirect-derivative", &include_indirect_derivative_); + cfl->GetValue("test-mode", &test_mode_); + if (!ok || dim_ <= 0) { + KALDI_ERR << "MemoryNormComponent must have 'dim' specified, and > 0"; + } + if (block_dim_ == -1) + block_dim_ = dim_; + if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && + epsilon_ > 0 && target_rms_ > 0)) + KALDI_ERR << "Invalid configuration in MemoryNormComponent."; + if (cfl->HasUnusedValues()) + KALDI_ERR << "Could not process these elements in initializer: " + << cfl->UnusedValues(); + stats_count_ = 0.0; + backward_count_ = 0.0; + data_.Resize(7, block_dim_); +} + + + +void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const { + KALDI_ASSERT(SameDim(in, *out) && + (in.NumCols() == dim_ || in.NumCols() == block_dim_)); + if (in.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. 
+ KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); + int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), + orig_cols = in.NumCols(), new_rows = orig_rows * ratio, + new_cols = orig_cols / ratio; + CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), + out_reshaped(out->Data(), new_rows, new_cols, new_cols); + return Propagate(indexes, in_reshaped, &out_reshaped); + } + + if (out->Data() != in.Data()) + out->CopyFromMat(in); + + // From this point, we can assume that the num-cols of 'in' and 'out' + // equals block_dim_. + if (test_mode_) { + if (stats_count_ <= 0.0) + KALDI_ERR << "Test mode set but no stats available."; + CuSubVector x_mean(data_, 3), scale(data_, 4); + out->AddVecToRows(-1.0, x_mean); + out->MulColsVec(scale); + return NULL; + } else { + Memo *memo = GetMemo(in); + CuSubVector x_sum(memo->data, 0), + scale(memo->data, 2); + out->AddVecToRows(-1.0 / memo->num_frames, x_sum); + out->MulColsVec(scale); + return memo; + } +} + + +MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( + const CuMatrixBase &in) const { + KALDI_ASSERT(in.NumCols() == block_dim_ && !test_mode_); + Memo *memo = new Memo; + int32 num_frames = in.NumRows(); + memo->num_frames = num_frames; + memo->data.Resize(5, block_dim_); + CuSubVector x_sum(memo->data, 0), + x_sumsq(memo->data, 1); + x_sum.AddRowSumMat(1.0, in, 0.0); + x_sumsq.AddDiagMat2(1.0, in, kTrans, 0.0); + if (stats_count_ > 0.0) { + memo->has_indirect_terms = include_indirect_derivative_; + if (include_indirect_derivative_) { + // copy over scale, x_deriv and scale_deriv. + memo->data.RowRange(2, 3).CopyFromMat(data_.RowRange(4, 3)); + } else { + // just copy over the scale. x_deriv and scale_deriv remain zero. + memo->data.Row(2).CopyFromVec(data_.Row(4)); + } + } else { + // We should only reach this point on when processing the first + // minibatch of each training job. + + // note: 'x_deriv' and 'scale_deriv' will be zero. 
This means we're + // ignoring the smaller, indirect term in the derivative for the first + // minibatch of each training job. That indirect term is really not that + // important that we should worry much about this. + memo->has_indirect_terms = false; + + CuSubVector scale(memo->data, 2); + scale.CopyFromVec(x_sumsq); + scale.AddVecVec(-1.0 / (num_frames * 1.0 * num_frames), + x_sum, x_sum, 1.0 / num_frames); + // At this point 'scale' is the variance. + // We apply the floor at 0.0 as a failsafe for problems caused by roundoff. + scale.ApplyFloor(0.0); + scale.Add(epsilon_); + // At this point 'scale' is the variance plus epsilon. + scale.ApplyPow(-0.5); + // OK, now 'scale' is the actual scale: the inverse standard deviation. + } + return memo; +} + +void MemoryNormComponent::Backprop( + const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, // unused. + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo_in, + Component *to_update_in, + CuMatrixBase *in_deriv) const { + + KALDI_ASSERT(SameDim(out_deriv, *in_deriv) && + (out_deriv.NumCols() == dim_ || + out_deriv.NumCols() == block_dim_)); + if (out_deriv.NumCols() != block_dim_) { + // if block_dim_ != dim_, we recurse; this helps keep the main code + // simple. + KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && + in_deriv->Stride() == in_deriv->NumCols()); + if (out_value.NumRows() != 0) { + KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); + } + int32 ratio = dim_ / block_dim_, + orig_rows = out_value.NumRows(), + orig_cols = out_value.NumCols(), + new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; + CuSubMatrix + out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), + in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); + + // we'll never use in_value, so pass it in unchanged. 
+ if (out_value.NumRows() != 0) { + CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, + new_cols, new_cols); + Backprop(debug_info, indexes, in_value, + out_value_reshaped, out_deriv_reshaped, + memo_in, to_update_in, &in_deriv_reshaped); + } else { + Backprop(debug_info, indexes, in_value, + out_value, out_deriv_reshaped, + memo_in, to_update_in, &in_deriv_reshaped); + } + return; + } + + // assume in_deriv is non-NULL, because a non-updatable Component will not + // have the backprop called if the in_deriv is non-NULL. + + if (test_mode_) { + KALDI_ASSERT(memo_in == NULL && stats_count_ != 0.0); + // the following is a no-op if in_deriv and out_deriv are the same matrix. + in_deriv->CopyFromMat(out_deriv); + CuSubVector scale(data_, 4); + in_deriv->MulColsVec(scale); + return; + } + + // OK, we're not in test mode. + // Before computing 'in_deriv', we may need to store some stats. + if (include_indirect_derivative_ && to_update_in != NULL) { + // Store some stats which are necessary to compute the 'indirect derivative' + // term (this is analogous to the part of the derivative in regular backprop + // that comes from the objf derivative w.r.t. the mean and variance stats). + // + // Note: instead of simply adding to the stats 'y_deriv' and 'y_deriv_y', + // the following equations do a kind of weighted combination, because + // these stats are stored normalized by the total count (backward_count_). 
+ MemoryNormComponent *to_update = + dynamic_cast(to_update_in); + BaseFloat backward_count = to_update->backward_count_, + num_frames = in_deriv->NumRows(), + new_backward_count = backward_count + num_frames, + old_weight = backward_count / new_backward_count; + CuSubVector y_deriv(to_update->data_, 2), + y_deriv_y(to_update->data_, 3); + // The factor 1.0 / new_backward_count that appears below can be perhaps more + // clearly written as follows: first define + // new_weight = num_frames / new_backward_count + // and then write new_weight / num_frames, which simplifies to + // 1.0 / new_backward_count. The factor of 1.0 / num_frames is necessary to + // convert from data sums to a per-frame average. + y_deriv.AddRowSumMat(1.0 / new_backward_count, out_deriv, old_weight); + y_deriv_y.AddDiagMatMat(1.0 / new_backward_count, out_deriv, kTrans, + out_value, kNoTrans, old_weight); + to_update->backward_count_ = new_backward_count; + // We don't bother calling to_update->ComputeDerived()-- although it would + // be harmless-- because in the current situations where this code is + // reached, to_update will be the delta_nnet_, and the derived parameters of + // delta_nnet_ aren't used. + + // to_update->ComputeDerived(); + } + + // the following does no work if in_deriv and out_deriv are the same matrix. + in_deriv->CopyFromMat(out_deriv); + + Memo *memo = static_cast(memo_in); + CuSubVector scale(memo->data, 2); + in_deriv->MulColsVec(scale); + if (memo->has_indirect_terms) { + CuSubVector x_deriv(memo->data, 3), + scale_deriv(memo->data, 4); + in_deriv->AddVecToRows(-1.0, x_deriv); + in_deriv->AddMatDiagVec(-1.0, out_value, kNoTrans, scale_deriv); + } +} + + +void MemoryNormComponent::ComputeDerived() { + KALDI_ASSERT(stats_count_ >= 0.0 && data_.NumRows() == 7); + if (stats_count_ == 0.0) { + // zero 'scale', 'x_deriv' and 'scale_deriv'. 
+ data_.RowRange(4, 3).SetZero(); + return; + } + CuSubVector x_mean(data_, 0), x_uvar(data_, 1), + y_deriv(data_, 2), y_deriv_y(data_, 3), scale(data_, 4); + scale.CopyFromVec(x_uvar); + scale.AddVecVec(-1.0, x_mean, x_mean, 1.0); + // at this point, 'scale' is the variance. + scale.ApplyFloor(0.0); + scale.Add(epsilon_); + scale.ApplyPow(-0.5); + if (backward_count_ == 0.0) { + // The following statement sets x_deriv and scale_deriv to zero. + data_.RowRange(5, 2).SetZero(); + } else { + // The following statement sets x_deriv = y_deriv * scale, + // and scale_deriv = y_deriv_y * scale. + data_.RowRange(5, 2).AddMatDiagVec(1.0, + data_.RowRange(2, 2), kNoTrans, scale, 0.0); + } +} + +void MemoryNormComponent::StoreStats( + const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + void *memo_in) { + // in test mode this component does not store stats; it doesn't provide the + // kStoresStats flag so this function won't be called. + KALDI_ASSERT(!test_mode_ && memo_in != NULL && stats_count_ >= 0.0); + + // We don't actually need 'in_value' and 'out_value', as the + // required statistics are already stored in 'memo_in'. + Memo *memo = static_cast(memo_in); + + BaseFloat num_frames = memo->num_frames, + old_stats_count = stats_count_, + new_stats_count = num_frames + old_stats_count, + old_weight = old_stats_count / new_stats_count; + + // x_mean_and_x_uvar is the first 2 rows of data_. + CuSubMatrix x_mean_and_x_uvar(data_, 0, 2, 0, block_dim_); + // x_sum_and_x_sumsq is the first 2 rows of data_. + CuSubMatrix x_sum_and_x_sumsq(memo->data, 0, 2, 0, block_dim_); + + x_mean_and_x_uvar.Scale(old_weight); + // The factor 1.0 / new_stats_count that appears below can be perhaps more + // clearly written as follows: first define + // new_weight = num_frames / new_stats_count + // and then write 'new_weight / num_frames', which simplifies to + // '1.0 / new_stats_count'. 
The factor of '1.0 / num_frames' + // is necessary to convert from data sums to a per-frame average. + x_mean_and_x_uvar.AddMat(1.0 / new_stats_count, x_sum_and_x_sumsq); + stats_count_ = new_stats_count; + ComputeDerived(); +} + +void MemoryNormComponent::Read(std::istream &is, bool binary) { + ExpectOneOrTwoTokens(is, binary, "", ""); + ReadBasicType(is, binary, &dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &block_dim_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &epsilon_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &target_rms_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &include_indirect_derivative_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &test_mode_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &stats_count_); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &backward_count_); + ExpectToken(is, binary, ""); + data_.Read(is, binary); + Check(); +} + +void MemoryNormComponent::Write(std::ostream &os, bool binary) const { + Check(); + WriteToken(os, binary, ""); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, block_dim_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, epsilon_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, target_rms_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, include_indirect_derivative_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, test_mode_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, stats_count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, backward_count_); + WriteToken(os, binary, ""); + data_.Write(os, binary); + WriteToken(os, binary, ""); +} + +void MemoryNormComponent::Scale(BaseFloat scale) { + if (scale <= 0) { + if (scale < 0.0) + KALDI_WARN << "Setting stats to zero in MemoryNormComponent: requested scale = " + << scale; + // If scale is negative we 
zero the stats. This may not always be the right + // thing to do, so we warn. + data_.SetZero(); + stats_count_ = 0.0; + backward_count_ = 0.0; + } else { + stats_count_ *= scale; + backward_count_ *= scale; + // 'data_' doesn't need to be changed, as all the quantities it contains are + // normalized by the count. + } +} + + +void MemoryNormComponent::Add(BaseFloat alpha, const Component &other_in) { + const MemoryNormComponent *other = + dynamic_cast(&other_in); + + static bool warned = false; + if (alpha < 0.0) { + if (!warned) { + warned = true; + KALDI_WARN << "Adding MemoryNormComponent with negative scale: will do nothing " + << "(will not warn again)."; + } + return; + } + + BaseFloat + new_stats_count = stats_count_ + alpha * other->stats_count_, + new_backward_count = backward_count_ + alpha * other->backward_count_; + + if (new_stats_count > 0.0) { + // This block sets rows 0 and 1 of data_, which we call 'x_mean' and + // 'x_uvar', to the appropriate weighted combination of 'this' and 'other'. + BaseFloat this_scale = stats_count_ / new_stats_count, + other_scale = alpha * other->stats_count_ / new_stats_count; + data_.RowRange(0, 2).Scale(this_scale); + data_.RowRange(0, 2).AddMat(other_scale, other->data_.RowRange(0, 2)); + } + if (new_backward_count > 0.0) { + // This block sets rows 2 and 3 of data_, which we call 'y_deriv' and + // 'y_deriv_y', to the appropriate weighted combination of 'this' and + // 'other'. + BaseFloat this_scale = backward_count_ / new_backward_count, + other_scale = alpha * other->backward_count_ / new_backward_count; + data_.RowRange(2, 2).Scale(this_scale); + data_.RowRange(2, 2).AddMat(other_scale, other->data_.RowRange(2, 2)); + } + stats_count_ = new_stats_count; + backward_count_ = new_backward_count; + ComputeDerived(); +} + +void MemoryNormComponent::ZeroStats() { + // We only zero the stats if we're not in test mode. 
In test mode, this would + // be dangerous as the stats aren't really considered to be stats, they become + // a fixed part of the model. + if (!test_mode_) { + stats_count_ = 0.0; + backward_count_ = 0.0; + data_.SetZero(); + } +} + + + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h new file mode 100644 index 00000000000..68506174eb7 --- /dev/null +++ b/src/nnet3/nnet-normalize-component.h @@ -0,0 +1,541 @@ +// nnet3/nnet-normalize-component.h + +// Copyright 2011-2013 Karel Vesely +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2013 Xiaohui Zhang +// 2014-2015 Vijayaditya Peddinti +// 2014-2015 Guoguo Chen +// 2015 Daniel Galvez +// 2015 Tom Ko + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_ +#define KALDI_NNET3_NNET_NORMALIZE_COMPONENT_H_ + +#include "nnet3/nnet-common.h" +#include "nnet3/nnet-component-itf.h" +#include "nnet3/natural-gradient-online.h" +#include + +namespace kaldi { +namespace nnet3 { + +/// @file nnet-normalize-component.h +/// +/// This file contains declarations of components that in one way or +/// another normalize their input: NormalizeComponent, BatchNormComponent, +/// and MemoryNormComponent. 
+ +/* + Implements the function: + + y = x * (sqrt(dim(x)) * target-rms) / |x| + + where |x| is the 2-norm of the vector x. I.e. its output is its input + scaled such that the root-mean-square values of its elements equals + target-rms. (As a special case, if the input is zero, it outputs zero). + + Note: if you specify add-log-stddev=true, it adds an extra element to + y which equals log(|x| / sqrt(dim(x))). + + + Configuration values accepted: + dim, or input-dim Input dimension of this component, e.g. 1024. + Will be the same as the output dimension if add-log-stddev=false. + block-dim Defaults to 'dim' you may specify a nonzero divisor + of 'dim'. In this case the input dimension will + be interpreted as blocks of dimension 'block-dim' + to which the nonlinearity described above is applied + separately. + add-log-stddev You can set this to true to add an extra output + dimension which will equal |x| / sqrt(dim(x)). + If block-dim is specified, this is done per block. + target-rms This defaults to 1.0, but if set it to another + (nonzero) value, the output will be scaled by this + factor. + */ +class NormalizeComponent: public Component { + public: + explicit NormalizeComponent(const NormalizeComponent &other); + + virtual int32 Properties() const { + return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds| + (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) | + (block_dim_ != input_dim_ ? 
kInputContiguous|kOutputContiguous : 0); + } + NormalizeComponent() { } + virtual std::string Type() const { return "NormalizeComponent"; } + virtual void InitFromConfig(ConfigLine *cfl); + virtual Component* Copy() const { return new NormalizeComponent(*this); } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &, // out_value + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); + virtual void Write(std::ostream &os, bool binary) const; + virtual int32 InputDim() const { return input_dim_; } + virtual int32 OutputDim() const { + return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0)); + } + virtual std::string Info() const; + private: + NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow. + enum { kExpSquaredNormFloor = -66 }; + // kSquaredNormFloor is about 0.7e-20. We need a value that's exactly representable in + // float and whose inverse square root is also exactly representable + // in float (hence, an even power of two). + static const BaseFloat kSquaredNormFloor; + int32 input_dim_; + int32 block_dim_; + BaseFloat target_rms_; // The target rms for outputs, default 1.0. + + bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D))) + // is an extra dimension of the output. +}; + + +/* + BatchNormComponent + + This implements batch normalization; for each dimension of the + input it normalizes the data to be zero-mean, unit-variance. You + can set the block-dim configuration value to implement spatial + batch normalization, see the comment for the variable. 
+ + If you want to combine this with the trainable offset and scale that the + original BatchNorm paper used, then follow this by the + ScaleAndOffsetComponent. + + It's a simple component (uses the kSimpleComponent flag), but it is unusual in + that it will give different results if you call it on half the matrix at a + time. Most of the time this would be pretty harmless, so we still return the + kSimpleComponent flag. We may have to modify the test code a little to + account for this, or possibly remove the kSimpleComponent flag. In some sense + each output Index depends on every input Index, but putting those dependencies + explicitly into the dependency-tracking framework as a GeneralComponent + would be very impractical and might lead to a lot of unnecessary things being + computed. You have to be a bit careful where you put this component, and understand + what you're doing e.g. putting it in the path of a recurrence is a bit problematic + if the minibatch size is small. + + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a nonzero divisor + of 'dim'. In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from n'th element of each + block are pooled into one class, for each n.a + epsilon Small term added to the variance that is used to prevent + division by zero + target-rms This defaults to 1.0, but if set, for instance, to 2.0, + it will normalize the standard deviation of the output to + 2.0. 'target-stddev' might be a more suitable name, but this + was chosen for consistency with NormalizeComponent. + */ +class BatchNormComponent: public Component { + public: + + BatchNormComponent() { } + + // call this with 'true' to set 'test mode' where the batch normalization is + // done with stored stats. 
There won't normally be any need to specially + // accumulate these stats; they are stored as a matter of course on each + // iteration of training, as for NonlinearComponents, and we'll use the stats + // from the most recent [script-level] iteration. + void SetTestMode(bool test_mode); + + // constructor using another component + BatchNormComponent(const BatchNormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "BatchNormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. This is not much of a cost, because this will be used + // in convnets where we have to do this anyway. + return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace| + kBackpropInPlace| + (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)| + (test_mode_ ? 0 : kUsesMemo|kStoresStats); + } + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *, // to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. 
+ + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new BatchNormComponent(*this); } + + virtual void Scale(BaseFloat scale); + virtual void Add(BaseFloat alpha, const Component &other); + virtual void ZeroStats(); + + + virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } + + virtual void StoreStats(const CuMatrixBase &in_value, + const CuMatrixBase &out_value, + void *memo); + + // Members specific to this component type. + // Note: the offset and scale will only be nonempty in 'test mode'. + const CuVector &Offset() const { return offset_; } + const CuVector &Scale() const { return scale_; } + + private: + + struct Memo { + // number of frames (after any reshaping). + int32 num_frames; + // 'mean_uvar_scale' is of dimension 4 by block_dim_: + // Row 0 = mean = the mean of the rows of the input + // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). + // Row 2 = scale = the scale of the renormalization, which is + // the inverse stddev of the input (modified by epsilon_, + // see the Propagate function). + // Row 3 is used as a temporary in Backprop. + CuMatrix mean_uvar_scale; + }; + + void Check() const; + + // this function is used in a couple of places; it turns the raw stats into + // the offset/scale term of a normalizing transform. + static void ComputeOffsetAndScale(double count, + BaseFloat epsilon, + const Vector &stats_sum, + const Vector &stats_sumsq, + Vector *offset, + Vector *scale); + // computes derived parameters offset_ and scale_. + void ComputeDerived(); + + // Dimension of the input and output. + int32 dim_; + // This would normally be the same as dim_, but if it's less (and it must be > + // 0 and must divide dim_), then each separate block of the input of dimension + // 'block_dim_' is treated like a separate frame for the purposes of + // normalization. 
This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + // Used to avoid exact-zero variances, epsilon has the dimension of a + // covariance. + BaseFloat epsilon_; + + // This value will normally be 1.0, which is the default, but you can set it + // to other values as a way to control how fast the following layer learns + // (smaller -> slower). The same config exists in NormalizeComponent. + BaseFloat target_rms_; + + // This is true if we want the batch normalization to operate in 'test mode' + // meaning the data mean and stddev used for the normalization are fixed + // quantities based on previously accumulated stats. Note: the stats we use + // for this are based on the same 'StoreStats' mechanism as we use for + // components like SigmoidComponent and ReluComponent; we'll be using + // the stats from the most recent [script-level] iteration of training. + bool test_mode_; + + + // total count of stats stored by StoreStats(). + double count_; + // sum-of-data component of stats of input data. + CuVector stats_sum_; + // sum-of-squared component of stats of input data. + CuVector stats_sumsq_; + + // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they + // dictate the transform that is done in 'test mode'. They are set only when + // reading the model from disk and when calling SetTestMode(true); they are + // resized to empty when the stats are updated, to ensure that out-of-date + // values are not kept around. + CuVector offset_; + CuVector scale_; +}; + + +/* + MemoryNormComponent + + MemoryNormComponent is like batch normalization, except the stats + are accumulated as a weighted sum over past minibatches (if this is + not the first minibatch), instead of over the current minibatch. + + You can use it in the same way you would normally use BatchNormComponent. 
+ + Accepted configuration values: + dim Dimension of the input and output + block-dim Defaults to 'dim', but may be set to a nonzero divisor + of 'dim'. In this case, each block of dimension 'block-dim' + is treated like a separate row of the input matrix, which + means that the stats from n'th element of each + block are pooled into one class, for each n.a + epsilon Small term added to the variance that is used to prevent + division by zero + target-rms This defaults to 1.0, but if set, for instance, to 2.0, + it will normalize the standard deviation of the output to + 2.0. 'target-stddev' might be a more suitable name, but this + was chosen for consistency with NormalizeComponent. + include-indirect-derivative This defaults to true, which means we + include the (smaller) derivative term that comes via the + mean and variance estimation. You might want to set this to + false for testing purposes. + */ +class MemoryNormComponent: public Component { + public: + + MemoryNormComponent() { } + + // constructor using another component + MemoryNormComponent(const MemoryNormComponent &other); + + virtual int32 InputDim() const { return dim_; } + virtual int32 OutputDim() const { return dim_; } + + virtual std::string Info() const; + virtual void InitFromConfig(ConfigLine *cfl); + virtual std::string Type() const { return "MemoryNormComponent"; } + virtual int32 Properties() const { + // If the block-dim is less than the dim, we need the input and output + // matrices to be contiguous (stride==num-cols), as we'll be reshaping + // internally. This is not much of a cost, because this will be used + // in convnets where we have to do this anyway. + bool iid = include_indirect_derivative_; + return kSimpleComponent|kPropagateInPlace|kBackpropInPlace| + (test_mode_ ? 0 : kUsesMemo|kStoresStats|(iid?kBackpropNeedsOutput:0))| + (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0); + + } + + // Call this function to set 'test mode' to true or false. 
In test + // mode the stats are frozen and will not be updated. + void SetTestMode(bool test_mode); + + + virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &in, + CuMatrixBase *out) const; + + /// The backprop function. In addition to propagating the input back to + /// 'in_deriv', if supplied, this function also updates, in 'to_update', + /// backward_count_ and the rows named 'y_deriv' and 'y_deriv_y' of + /// data_, and also the derived quantities 'x_deriv' and 'scale_deriv' + /// of data_. + /// (note: in training, 'to_update' will point to delta_nnet_, and later these + /// stats get added to nnet_ via Add()) + virtual void Backprop(const std::string &debug_info, + const ComponentPrecomputedIndexes *indexes, + const CuMatrixBase &, // in_value + const CuMatrixBase &out_value, + const CuMatrixBase &out_deriv, + void *memo, + Component *to_update, + CuMatrixBase *in_deriv) const; + + virtual void Read(std::istream &is, bool binary); // This Read function + // requires that the Component has the correct type. + + /// Write component to stream + virtual void Write(std::ostream &os, bool binary) const; + virtual Component* Copy() const { return new MemoryNormComponent(*this); } + + // Note: if you scale by a negative number it will set stats to zero + // rather than allow a negative stats count. + virtual void Scale(BaseFloat scale); + // Note: if you try to add with negative coefficient (as in backstitch), it + // will do nothing. + virtual void Add(BaseFloat alpha, const Component &other); + virtual void ZeroStats(); + + virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } + + /// This function updates stats_count_, the rows named 'x_mean', 'x_uvar' + /// of data_, and also the derived quantities stored in the rows named + /// 'scale', 'x_deriv' and 'scale_deriv' of data_. 
+ /// (note: in training, this is called on the delta_nnet_, and later + /// the stats get added to nnet_ via Add()) + virtual void StoreStats(const CuMatrixBase &, // in_value + const CuMatrixBase &, // out_value + void *memo); + + private: + + struct Memo { + // The number of frames (after any reshaping; so in general it will + // be the original NumRows() of the matrix, times dim_ / block_dim_). + int32 num_frames; + // 'data' is of dimension 5 by block_dim_. + // Row 0, which we'll call 'x_sum', is the sum of the rows of the + // input data. + // Row 1, which we'll call 'x_sumsq', is the sum of the rows of the + // elementwise square of the input data matrix. + // Row 2,3,4 are 'scale', 'x_deriv', 'scale_deriv', which + // are just copies of the corresponding values in + // MemoryNormComponent::data_ (from the const nnet, the one we're + // training), and which will have been copied from there when this + // object was created. However if stats_count_ was <= 0 when this + // object was created (first minibatch), then 'scale' + // will be set to the mean and inverse-stddev implied by the stats + // 'sum' and 'sumsq', and 'x_deriv' and 'scale_deriv' will be zero. + // This is so that it does something sensible on the very first + // minibatch we train. The reason why we copy these quantities here + // is because in the backprop phase we feel it would be better to + // use the same values that were used in the forward propagation, + // instead of the possibly-updated values that might exist when + // Backprop() is called. It's actually not clear whether this is + // necessary. + CuMatrix data; + + // This is set to true if we have the 'indirect' terms in the derivative, + // relating to the 'x_deriv' and 'scale_deriv' terms in 'data'. If false, + // we save some computation. + bool has_indirect_terms; + }; + + + /// This piece of code, which has been broken out from Propagate(), computes + /// the memo. Expects in.NumCols() == block_dim_. 
It should only be called + /// if test_mode_ is false. + Memo *GetMemo(const CuMatrixBase &in) const; + + /// This function computes certain members of data_ that are derived: + /// specifically, rows 4, 5 and 6, which are called 'scale', 'x_deriv' and + /// 'scale_deriv'. + void ComputeDerived(); + + void Check() const; + + // this function is used in a couple of places; it turns the raw stats into + // the offset/scale term of a normalizing transform. + static void ComputeOffsetAndScale(BaseFloat count, + BaseFloat epsilon, + const Vector &stats_sum, + const Vector &stats_sumsq, + Vector *offset, + Vector *scale); + + // Dimension of the input and output. + int32 dim_; + + // block_dim_ would normally be the same as dim_, but if it's less (and it + // must be > 0 and must divide dim_), then each separate block of the input of + // dimension 'block_dim_' is treated like a separate frame for the purposes of + // normalization. This can be used to implement spatial batch normalization + // for convolutional setups-- assuming the filter-dim has stride 1, which it + // always will in the new code in nnet-convolutional-component.h. + int32 block_dim_; + + // Used to avoid exact-zero variances, epsilon has the dimension of a + // covariance. + BaseFloat epsilon_; + + // This controls the dynamic range of the output. At 1.0 which is the + // default, the output has unit standard deviation, but you can set it to + // other values. The same config exists in NormalizeComponent. + BaseFloat target_rms_; + + // If true, we include the smaller indirect part of the derivative, that comes + // via the stats estimation. This is included mostly for testing purposes; we + // expect this will normally be true. + bool include_indirect_derivative_; + + // If test_mode_ is set, no stats will be accumulated. It's an error if + // test_mode_ is set and the data count is zero, and you try to propagate. 
+ bool test_mode_; + + // The total count of stats stored by StoreStats(), and which are represented + // in x_mean = data_.Row(0) and x_uvar = data_.Row(1). We never allow this to + // become less than zero, even if people do unexpected things with Add() and + // Scale(). + BaseFloat stats_count_; + + // backward_count_ is the total count of stats accumulated during backprop, + // and represents the count corresponding to the stats in 'y_deriv' and + // 'y_deriv_y'. It is expected to be either zero or the same as stats_count_, + // in most circumstances, depending whether you were doing backprop or just + // inference-- but we don't enforce this because there may be situations where + // this is not the case. + // + // We never allow this to become less than zero, even if people do unexpected + // things with Add() and Scale(). + BaseFloat backward_count_; + + // We store data_ as a single matrix because it enables certain operations + // to be done using fewer kernels, but it contains various different quantities, + // which we'll describe below as if they were separate variables. + // data_ is of dimension 7 by block_dim_. + CuMatrix data_; + // data_.Row(0) is 'x_mean', which is the decaying moving-average of + // input data x; or zero if stats_count_ is zero. + // data_.Row(1) is 'x_uvar', which is the decaying moving-average of + // input data x^2 or zero if stats_count_ is zero. + // data_.Row(2) is 'y_deriv', which is the decaying moving-average + // derivative of the objective w.r.t. the output y; or + // zero if backward_count_ is zero. + // data_.Row(3) is 'y_deriv_y', which is the decaying moving average + // of the product of the output times (the derivative of the + // objective w.r.t. the output); or zero if backward_count_ + // is zero. + // + // The quantities below are derived from the stats above. 
+ // + // data_.Row(4) is 'scale', which is the inverse square root of the + // covariance computed from x_mean and x_uvar (plus epsilon), + // or zero if stats_count_ is zero. + // data_.Row(5) is 'x_deriv', which is the negative of the average derivative + // (per frame) of the objective function w.r.t the input x (just the + // part that comes via the derivative w.r.t. the x mean). + // 'x_deriv' equals 'y_deriv' times 'scale'. + // data_.Row(6) is 'scale_deriv', which relates to the part of the + // derivative w.r.t. the input that comes from the objf + // derivative w.r.t. the scale. It equals scale * y_deriv_y. +}; + + + + + +} // namespace nnet3 +} // namespace kaldi + + +#endif diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 2c4da825013..6dd2873bd81 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -481,7 +481,7 @@ static void PrintFloatSuccinctly(std::ostream &os, BaseFloat f) { // Returns a string that summarizes a vector fairly succintly, for // printing stats in info lines. -std::string SummarizeVector(const Vector &vec) { +std::string SummarizeVector(const VectorBase &vec) { std::ostringstream os; if (vec.Dim() < 10) { os << "[ "; @@ -517,6 +517,11 @@ std::string SummarizeVector(const Vector &vec) { return os.str(); } +std::string SummarizeVector(const CuVectorBase &cu_vec) { + Vector vec(cu_vec); + return SummarizeVector(vec); +} + void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuVectorBase ¶ms, diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index fef21301ff6..7f1380bf253 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -191,7 +191,9 @@ std::string ErrorContext(const std::string &str); // Returns a string that summarizes a vector fairly succintly, for // printing stats in info lines. 
-std::string SummarizeVector(const Vector &vec); +std::string SummarizeVector(const VectorBase &vec); + +std::string SummarizeVector(const CuVectorBase &vec); /** Print to 'os' some information about the mean and standard deviation of some parameters, used in Info() functions in nnet-simple-component.cc. diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index d6c4e2163bf..e76f7cae2a7 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -313,179 +313,6 @@ void ElementwiseProductComponent::Write(std::ostream &os, bool binary) const { WriteToken(os, binary, ""); } -const BaseFloat NormalizeComponent::kSquaredNormFloor = - pow(2.0, NormalizeComponent::kExpSquaredNormFloor); - -NormalizeComponent::NormalizeComponent(const NormalizeComponent &other): - input_dim_(other.input_dim_), block_dim_(other.block_dim_), - target_rms_(other.target_rms_), - add_log_stddev_(other.add_log_stddev_) { } - -void NormalizeComponent::InitFromConfig(ConfigLine *cfl) { - input_dim_ = 0; - add_log_stddev_ = false; - target_rms_ = 1.0; - bool ok = cfl->GetValue("dim", &input_dim_) || - cfl->GetValue("input-dim", &input_dim_); - block_dim_ = input_dim_; - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("add-log-stddev", &add_log_stddev_); - if (!ok || cfl->HasUnusedValues() || input_dim_ <= 0 || target_rms_ <= 0.0 || - block_dim_ <= 0 || input_dim_ % block_dim_ != 0) - KALDI_ERR << "Invalid initializer for layer of type " - << Type() << ": \"" << cfl->WholeLine() << "\""; -} - -void NormalizeComponent::Read(std::istream &is, bool binary) { - std::string token; - ReadToken(is, binary, &token); - if (token == "") { - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == "" || token == ""); - ReadBasicType(is, binary, &input_dim_); // Read dimension. 
- ReadToken(is, binary, &token); - if (token == "") { - ReadBasicType(is, binary, &block_dim_); - ReadToken(is, binary, &token); - } else { - block_dim_ = input_dim_; - } - // read target_rms_ if it is available. - if (token == "") { - ReadBasicType(is, binary, &target_rms_); - ReadToken(is, binary, &token); - } - // Read add_log_stddev_ token, if it is available. - if (token == "") { - ReadBasicType(is, binary, &add_log_stddev_); - ReadToken(is, binary, &token); - } - if (token == "") { - // back-compatibility code. - CuVector temp; - temp.Read(is, binary); - ExpectToken(is, binary, ""); - temp.Read(is, binary); - ExpectToken(is, binary, ""); - double count; - ReadBasicType(is, binary, &count); - ReadToken(is, binary, &token); - } - KALDI_ASSERT(token == ""); -} - -void NormalizeComponent::Write(std::ostream &os, bool binary) const { - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, input_dim_); - if (block_dim_ != input_dim_) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, block_dim_); - } - WriteToken(os, binary, ""); - WriteBasicType(os, binary, target_rms_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, add_log_stddev_); - WriteToken(os, binary, ""); -} - -std::string NormalizeComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", input-dim=" << InputDim() - << ", output-dim=" << OutputDim() << ", target-rms=" << target_rms_ - << ", add-log-stddev=" << std::boolalpha << add_log_stddev_; - if (block_dim_ != input_dim_) - stream << ", block-dim=" << block_dim_; - return stream.str(); -} - -// The output y_i = scale * x_i, -// and we want to RMS value of the y_i to equal target_rms, -// so y^t y = D * target_rms^2 (if y is one row of the input). -// we need to have scale = 1.0 / sqrt(x^t x / (D * target_rms^2)). -// there is also flooring involved, to avoid division-by-zero -// problems. 
It's important for the backprop, that the floor's -// square root is exactly representable as float. -// If add_log_stddev_ is true, log(max(epsi, sqrt(x^t x / D))) -// is an extra dimension of the output. -void* NormalizeComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(in.NumCols() == InputDim() && out->NumCols() == OutputDim() && - in.NumRows() == out->NumRows()); - if (block_dim_ != input_dim_) { - int32 num_blocks = input_dim_ / block_dim_, - new_num_rows = in.NumRows() * num_blocks, - output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); - KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); - CuSubMatrix in_reshaped(in.Data(), new_num_rows, - block_dim_, block_dim_), - out_reshaped(out->Data(), new_num_rows, - output_block_dim, output_block_dim); - cu::NormalizePerRow(in_reshaped, target_rms_, add_log_stddev_, - &out_reshaped); - } else { - cu::NormalizePerRow(in, target_rms_, add_log_stddev_, out); - } - return NULL; -} - -/* - A note on the derivative of NormalizeComponent... - let both row_in and row_out be vectors of dimension D. - Let p = row_in^T row_in / (D * target_rms^2), and let - f = 1.0 / sqrt(max(kSquaredNormFloor, p)), and we compute row_out as: - row_out = f row_in. - Suppose we have a quantity deriv_out which is the derivative - of the objective function w.r.t. row_out. We want to compute - deriv_in which is the derivative of the objective function w.r.t. - row_in. Let the objective function be F. One term is obvious: we have - deriv_in = f deriv_out + .... - next we have to take into account the derivative that gets back-propagated - through f. Obviously, dF/df = deriv_out^T row_in. - And df/dp = (p <= kSquaredNormFloor ? 0.0 : -0.5 p^{-1.5}) = (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3), - and dp/d(row_in) = 2/(D * target_rms^2) row_in. [it's vector_valued]. 
- So this term in dF/d(row_in) equals: - dF/df df/dp dp/d(row_in) = 2/(D * target_rms^2) (f == 1.0 / sqrt(kSquaredNormFloor) ? 0.0 : -0.5 f^3) (deriv_out^T row_in) row_in - So - deriv_in = f deriv_out + (f == 1.0 ? 0.0 : -f^3 / (D * target_rms^2) ) (deriv_out^T row_in) row_in - - if add_log_stddev_ true, the deriv_in has another term as - dF/dx_i = dF/df . df/dx_i => df/dx_i = x_i/(x^T x) -*/ -void NormalizeComponent::Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const { - if (!in_deriv) - return; - if (block_dim_ != input_dim_) { - int32 num_blocks = input_dim_ / block_dim_, - new_num_rows = in_value.NumRows() * num_blocks, - output_block_dim = block_dim_ + (add_log_stddev_ ? 1 : 0); - KALDI_ASSERT(in_value.Stride() == in_value.NumCols() && - out_deriv.Stride() == out_deriv.NumCols() && - in_deriv->Stride() == in_deriv->NumCols()); - CuSubMatrix in_value_reshaped(in_value.Data(), new_num_rows, - block_dim_, block_dim_), - out_deriv_reshaped(out_deriv.Data(), new_num_rows, - output_block_dim, output_block_dim), - in_deriv_reshaped(in_deriv->Data(), new_num_rows, - block_dim_, block_dim_); - cu::DiffNormalizePerRow(in_value_reshaped, out_deriv_reshaped, target_rms_, - add_log_stddev_, &in_deriv_reshaped); - } else { - cu::DiffNormalizePerRow(in_value, out_deriv, target_rms_, add_log_stddev_, - in_deriv); - } -} - void* SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { @@ -5880,489 +5707,6 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { } } - - -void BatchNormComponent::ComputeDerived() { - if (!test_mode_) { - offset_.Resize(0); - scale_.Resize(0); - return; - } - - if (count_ == 0.0) { - KALDI_WARN << "Test-mode is set but there is no data count. 
" - "Creating random counts. This only makes sense " - "in unit-tests (or compute_prob_*.0.log). If you see this " - "elsewhere, something is very wrong."; - count_ = 1.0; - stats_sum_.SetRandn(); - stats_sumsq_.SetRandn(); - stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); - } - - offset_.Resize(block_dim_); - scale_.Resize(block_dim_); - offset_.CopyFromVec(stats_sum_); - offset_.Scale(-1.0 / count_); - // now offset_ is -mean. - scale_.CopyFromVec(stats_sumsq_); - scale_.Scale(1.0 / count_); - scale_.AddVecVec(-1.0, offset_, offset_, 1.0); - // now scale_ is variance. - // Mathematically the ApplyFloor statement should be a no-op; this is in case - // of numerical roundoff. - scale_.ApplyFloor(0.0); - scale_.Add(epsilon_); - scale_.ApplyPow(-0.5); - // now scale_ = min(variance, epsilon)^{-0.5}. - // next, multiply by the target RMS (normally 1.0). - scale_.Scale(target_rms_); - offset_.MulElements(scale_); - // now offset_ is -(scale*mean). -} - -void BatchNormComponent::SetTestMode(bool test_mode) { - test_mode_ = test_mode; - ComputeDerived(); -} - -void BatchNormComponent::Check() const { - KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0.0 && target_rms_ > 0.0); -} - -BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), - target_rms_(other.target_rms_), test_mode_(other.test_mode_), - count_(other.count_), stats_sum_(other.stats_sum_), - stats_sumsq_(other.stats_sumsq_) { - ComputeDerived(); - Check(); -} - - -std::string BatchNormComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ - << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ - << ", count=" << count_ - << ", test-mode=" << (test_mode_ ? 
"true" : "false"); - if (count_ > 0) { - Vector mean(stats_sum_), var(stats_sumsq_); - mean.Scale(1.0 / count_); - var.Scale(1.0 / count_); - // subtract mean^2 from var. - var.AddVecVec(-1.0, mean, mean, 1.0); - var.ApplyFloor(0.0); - var.ApplyPow(0.5); // make it the stddev. - stream << ", data-mean=" << SummarizeVector(mean) - << ", data-stddev=" << SummarizeVector(var); - } - return stream.str(); -} - -void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { - dim_ = -1; - block_dim_ = -1; - epsilon_ = 1.0e-03; - target_rms_ = 1.0; - test_mode_ = false; - bool ok = cfl->GetValue("dim", &dim_); - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("epsilon", &epsilon_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("test-mode", &test_mode_); - if (!ok || dim_ <= 0) { - KALDI_ERR << "BatchNormComponent must have 'dim' specified, and > 0"; - } - if (block_dim_ == -1) - block_dim_ = dim_; - if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0 && target_rms_ > 0)) - KALDI_ERR << "Invalid configuration in BatchNormComponent."; - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - count_ = 0; - stats_sum_.Resize(block_dim_); - stats_sumsq_.Resize(block_dim_); - if (test_mode_) { - ComputeDerived(); - } -} - - - -/* - BATCH_NORM_MATH - - This comment describes the equations involved in batch normalization, and - derives the forward and back-propagation. - - This is all dimension-by-dimension, so we just imagine the inputs - are scalars x(i), for i=0 .. n-1. - - FORWARD PASS: - - Define xsum = sum_i x(i) - x2sum = sum_i x(i)^2 - mean = xsum / n - var = x2sum / n - (mean*mean) - scale = (var + epsilon)^{-0.5} - offset = -mean * scale - - y(i) = scale * x(i) + offset - - Most of the rest of this comment derives how to compute the derivatives. If - you just want the formulas, please skip to the string 'BACKWARD PASS' below. 
- - We'll use a notation where an apostrophe on something means (the derivative of - the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. - We are given y'(i). Propagating the derivatives backward: - offset' = sum_i y'(i) - scale' = (sum_i y'(i) * x(i)) - offset' * mean - var' = scale' * -0.5 * (var + epsilon)^{-1.5} - = -0.5 * scale' * scale^3 - mean' = -offset' * scale - 2 * mean * var' - xsum' = mean' / n - x2sum' = var' / n - - So the derivatives propagated back to the original data are: - x'(i) = y'(i) * scale + xsum' + x(i) * x2sum' - - The above is quite complicated to compute, but we can use some invariances - to work out a simpler way to compute the derivatives. - - Firstly, note that x'(i) is of the form: - - x'(i) = y'(i) * scale + [affine function of x(i)]. - - [it's a 1-d affine function, i.e. offset and scale]. - This has the same functional form as: - - x'(i) = y'(i) * scale + [affine function of y(i)]. - - since y(i) is an affine function of x(i) with nonzero scale. - Because the output is invariant to shifts in the input, sum_i x'(i) - will be zero. This is sufficient to determine the bias - term in the affine function. [Note: the scale on y(i) doesn't - come into it because the y(i) sum to zero]. The offset - will just be (sum_i y'(i) * scale / n); this makes the sum of x'(i) zero. - So let's write it as - - x'(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha y(i). - - and it will be convenient to define: - - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - - which is just y'(i) with mean subtraction, scaled according to - the scale used in the normalization. So write - - x'(i) = x_deriv_base(i) + alpha y(i). - - The question is, what is the scale alpha. We don't actually need to - do any differentiation to figure this out. First, assume there is - no "+ epsilon" in the variance; later we'll explain why this doesn't - matter. The key to working out alpha is that the output is invariant - to scaling of the input. 
Assume we scale around the input's mean, - since that makes the math simpler. We can express this by the - constraint that (\sum_i x'(i) * (x(i) - avg-x)) = 0. This is - equivalent to the constraint that (\sum_i x'(i) y (i)) = 0, since - y(i) is x(i) - avg-x times a nonzero scale. We'll use this contraint - to determine alpha, Using the above expressionfor x(i), we can write - this constraint as: - \sum_i ( y(i) x_deriv_base(i) + alpha y(i) y(i)) = 0. - Now, since we said we'd ignore the epsilon, the output has unit variance, - so we know that \sum_i y(i) y(i) = n. - So alpha = - \sum_i y(i) x_deriv_base(i) / n. We can actually re-imagine - the epsilon term (or variance-flooring) as having been implemented by - adding a couple extra rows to the matrix with suitable values, and zero - output-deriv for those rows. If you think about it carefully you'll see that - the formula above is valid even if there is an extra term - in the variance. Anyway the correctness of the derivative will get tested - throughly by the component unit-tests. - - So to recap, here is the backprop. - - BACKWARD PASS: - - We are given y'(i), scale, and y(i). - - We compute: - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - alpha = - \sum_i y(i) x_deriv_base(i) / n - x'(i) = x_deriv_base(i) + alpha y(i) - */ - - - -void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(SameDim(in, *out) && - (in.NumCols() == dim_ || in.NumCols() == block_dim_)); - if (in.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. 
- KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); - int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), - orig_cols = in.NumCols(), new_rows = orig_rows * ratio, - new_cols = orig_cols / ratio; - CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), - out_reshaped(out->Data(), new_rows, new_cols, new_cols); - return Propagate(indexes, in_reshaped, &out_reshaped); - } - - // From this point, we can assume that the num-cols of 'in' and 'out' - // equals block_dim_. - - if (!test_mode_) { - // search in the comment above for FORWARD PASS to see what is being - // implemented here. - // if this takes too much time due to multiple different CUDA calls, - // we'll consider making a single kernel for some of it. - Memo *memo = new Memo; - int32 num_frames = in.NumRows(), dim = block_dim_; - memo->num_frames = num_frames; - memo->mean_uvar_scale.Resize(4, dim); - CuSubVector mean(memo->mean_uvar_scale, 0), - uvar(memo->mean_uvar_scale, 1), - scale(memo->mean_uvar_scale, 2); - mean.AddRowSumMat(1.0 / num_frames, in, 0.0); - uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); - scale.CopyFromVec(uvar); - // by applying this scale at this point, we save a multiply later on. - BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); - scale.AddVecVec(-var_scale, mean, mean, var_scale); - // at this point, 'scale' contains just the variance [divided by target-rms^2]. - scale.ApplyFloor(0.0); - scale.Add(var_scale * epsilon_); - // Now 'scale' contains the variance floored to zero and then with epsilon - // added [both divided by target-rms^2]. - scale.ApplyPow(-0.5); - // now 'scale' is the actual scale we'll use. - - // the next command will do no work if out == in, for in-place propagation. 
- out->CopyFromMat(in); - out->AddVecToRows(-1.0, mean, 1.0); - out->MulColsVec(scale); - return static_cast(memo); - } else { - if (offset_.Dim() != block_dim_) { - if (count_ == 0) - KALDI_ERR << "Test mode set in BatchNormComponent, but no stats."; - else // why was ComputeDerived() not called? - KALDI_ERR << "Code error in BatchNormComponent"; - } - out->CopyFromMat(in); - out->MulColsVec(scale_); - out->AddVecToRows(1.0, offset_, 1.0); - return NULL; - } -} - -void BatchNormComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, // unused - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo_in, - Component *to_update, // unused - CuMatrixBase *in_deriv) const { - - KALDI_ASSERT(SameDim(out_value, out_deriv) && - SameDim(out_value, *in_deriv) && - (out_value.NumCols() == dim_ || - out_value.NumCols() == block_dim_)); - if (out_value.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. - KALDI_ASSERT(out_value.Stride() == out_value.NumCols() && - out_deriv.Stride() == out_deriv.NumCols() && - in_deriv->Stride() == in_deriv->NumCols()); - int32 ratio = dim_ / block_dim_, - orig_rows = out_value.NumRows(), - orig_cols = out_value.NumCols(), - new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; - CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, - new_cols, new_cols), - out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), - in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); - // we'll never use in_value, so pass it in unchanged. - Backprop(debug_info, indexes, in_value, - out_value_reshaped, out_deriv_reshaped, - memo_in, to_update, &in_deriv_reshaped); - return; - } - - Memo *memo = static_cast(memo_in); - - if (!test_mode_) { - // search above for BACKWARD PASS for a comment describing the math. 
- KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); - int32 num_frames = memo->num_frames; - KALDI_ASSERT(out_value.NumRows() == num_frames); - CuSubVector temp(memo->mean_uvar_scale, 3), - scale(memo->mean_uvar_scale, 2); - temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); - // the following does no work if in_deriv and out_deriv are the same matrix. - in_deriv->CopyFromMat(out_deriv); - in_deriv->AddVecToRows(1.0, temp); - in_deriv->MulColsVec(scale); - // at this point, 'in_deriv' contains: - // x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - temp.AddDiagMatMat(-1.0 / (num_frames * target_rms_ * target_rms_), - out_value, kTrans, *in_deriv, kNoTrans, 0.0); - // now, 'temp' contains the quantity which we described - // in the math as: - // alpha = - \sum_i y(i) x_deriv_base(i) / n. - // The factor 1 / (target_rms_ * target_rms_) comes from following - // this additional scaling factor through the math. In the comment I said - // "we know that \sum_i y(i) y(i) = n". Taking target-rms into account - // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2". - in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0); - // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i). - - } else { - KALDI_ASSERT(offset_.Dim() == block_dim_); - // the next call does no work if they point to the same memory. - in_deriv->CopyFromMat(out_deriv); - in_deriv->MulColsVec(scale_); - } -} - -void BatchNormComponent::StoreStats( - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - void *memo_in) { - // in test mode this component does not store stats, it doesn't provide the - // kStoresStats flag. - KALDI_ASSERT(!test_mode_); - KALDI_ASSERT(out_value.NumCols() == dim_ || out_value.NumCols() == block_dim_); - if (out_value.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. 
- KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); - int32 ratio = dim_ / block_dim_, - orig_rows = out_value.NumRows(), - orig_cols = out_value.NumCols(), - new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; - CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, - new_cols, new_cols); - // we'll never use in_value, so just pass it in unchanged. - StoreStats(in_value, out_value_reshaped, memo_in); - return; - } - - Memo *memo = static_cast(memo_in); - KALDI_ASSERT(out_value.NumRows() == memo->num_frames); - - CuSubVector mean(memo->mean_uvar_scale, 0), - uvar(memo->mean_uvar_scale, 1); - KALDI_ASSERT(mean.Dim() == block_dim_ && memo->num_frames > 0); - BaseFloat num_frames = memo->num_frames; - if (stats_sum_.Dim() != block_dim_) { - stats_sum_.Resize(block_dim_); - stats_sumsq_.Resize(block_dim_); - KALDI_ASSERT(count_ == 0); - } - count_ += num_frames; - stats_sum_.AddVec(num_frames, mean, 1.0); - stats_sumsq_.AddVec(num_frames, uvar, 1.0); -} - -void BatchNormComponent::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &block_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &epsilon_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &target_rms_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &test_mode_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &count_); - ExpectToken(is, binary, ""); - stats_sum_.Read(is, binary); - ExpectToken(is, binary, ""); - stats_sumsq_.Read(is, binary); - stats_sumsq_.AddVecVec(1.0, stats_sum_, stats_sum_, 1.0); - stats_sum_.Scale(count_); - stats_sumsq_.Scale(count_); - ExpectToken(is, binary, ""); - ComputeDerived(); - Check(); -} - -void BatchNormComponent::Write(std::ostream &os, bool binary) const { - Check(); - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, dim_); - WriteToken(os, 
binary, ""); - WriteBasicType(os, binary, block_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, epsilon_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, target_rms_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, test_mode_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, count_); - CuVector mean(stats_sum_), var(stats_sumsq_); - if (count_ != 0) { - mean.Scale(1.0 / count_); - var.Scale(1.0 / count_); - var.AddVecVec(-1.0, mean, mean, 1.0); - } - WriteToken(os, binary, ""); - mean.Write(os, binary); - WriteToken(os, binary, ""); - var.Write(os, binary); - WriteToken(os, binary, ""); -} - -void BatchNormComponent::Scale(BaseFloat scale) { - if (scale == 0) { - count_ = 0.0; - stats_sum_.SetZero(); - stats_sumsq_.SetZero(); - } else { - count_ *= scale; - stats_sum_.Scale(scale); - stats_sumsq_.Scale(scale); - } -} - - -void BatchNormComponent::Add(BaseFloat alpha, const Component &other_in) { - const BatchNormComponent *other = - dynamic_cast(&other_in); - count_ += alpha * other->count_; - stats_sum_.AddVec(alpha, other->stats_sum_); - stats_sumsq_.AddVec(alpha, other->stats_sumsq_); - // this operation might change offset_ and scale_, so we recompute them - // in this instance (but not in Scale()). - ComputeDerived(); -} - -void BatchNormComponent::ZeroStats() { - // We only zero the stats if we're not in test mode. In test mode, this would - // be dangerous as the stats are the source for the transform, and zeroing - // them and then calling ComputeDerived() again would remove the transform - // parameters (offset_ and scale_). 
- if (!test_mode_) { - count_ = 0.0; - stats_sum_.SetZero(); - stats_sumsq_.SetZero(); - } -} - - SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): input_dim_(other.input_dim_), output_dim_(other.output_dim_), scale_(other.scale_) { } diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index d7cece06284..2e0965d25e9 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -40,6 +40,9 @@ namespace nnet3 { /// output for one input, and return the kSimpleComponent flag in their /// Properties(): for example, tanh and affine components. In /// nnet-general-component.h there are components that don't fit this pattern. +/// +/// Some components that do provide the kSimpleComponent flag are not declared +/// here: see also nnet-normalize-component.h. // This "nnet3" version of the p-norm component only supports the 2-norm. class PnormComponent: public Component { @@ -186,82 +189,6 @@ class ElementwiseProductComponent: public Component { int32 output_dim_; }; -/* - Implements the function: - - y = x * (sqrt(dim(x)) * target-rms) / |x| - - where |x| is the 2-norm of the vector x. I.e. its output is its input - scaled such that the root-mean-square values of its elements equals - target-rms. (As a special case, if the input is zero, it outputs zero). - - Note: if you specify add-log-stddev=true, it adds an extra element to - y which equals log(|x| / sqrt(dim(x))). - - - Configuration values accepted: - dim, or input-dim Input dimension of this component, e.g. 1024. - Will be the same as the output dimension if add-log-stddev=false. - block-dim Defaults to 'dim' you may specify a nonzero divisor - of 'dim'. In this case the input dimension will - be interpreted as blocks of dimension 'block-dim' - to which the nonlinearity described above is applied - separately. - add-log-stddev You can set this to true to add an extra output - dimension which will equal |x| / sqrt(dim(x)). 
- If block-dim is specified, this is done per block. - target-rms This defaults to 1.0, but if set it to another - (nonzero) value, the output will be scaled by this - factor. - */ -class NormalizeComponent: public Component { - public: - explicit NormalizeComponent(const NormalizeComponent &other); - - virtual int32 Properties() const { - return kSimpleComponent|kBackpropNeedsInput|kBackpropAdds| - (add_log_stddev_ ? 0 : kPropagateInPlace|kBackpropInPlace) | - (block_dim_ != input_dim_ ? kInputContiguous|kOutputContiguous : 0); - } - NormalizeComponent() { } - virtual std::string Type() const { return "NormalizeComponent"; } - virtual void InitFromConfig(ConfigLine *cfl); - virtual Component* Copy() const { return new NormalizeComponent(*this); } - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &, // out_value - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); - virtual void Write(std::ostream &os, bool binary) const; - virtual int32 InputDim() const { return input_dim_; } - virtual int32 OutputDim() const { - return (input_dim_ + (add_log_stddev_ ? (input_dim_ / block_dim_) : 0)); - } - virtual std::string Info() const; - private: - NormalizeComponent &operator = (const NormalizeComponent &other); // Disallow. - enum { kExpSquaredNormFloor = -66 }; - // kSquaredNormFloor is about 0.7e-20. We need a value that's exactly representable in - // float and whose inverse square root is also exactly representable - // in float (hence, an even power of two). - static const BaseFloat kSquaredNormFloor; - int32 input_dim_; - int32 block_dim_; - BaseFloat target_rms_; // The target rms for outputs, default 1.0. 
- - bool add_log_stddev_; // If true, log(max(epsi, sqrt(row_in^T row_in / D))) - // is an extra dimension of the output. -}; - - /* Implements the sigmoid nonlinearity, i.e. the function y = exp(-x). @@ -2384,183 +2311,6 @@ class MaxpoolingComponent: public Component { }; -/* - BatchNormComponent - - This implements batch normalization; for each dimension of the - input it normalizes the data to be zero-mean, unit-variance. You - can set the block-dim configuration value to implement spatial - batch normalization, see the comment for the variable. - - If you want to combine this with the trainable offset and scale that the - original BatchNorm paper used, then follow this by the - ScaleAndOffsetComponent. - - It's a simple component (uses the kSimpleComponent flag), but it is unusual in - that it will give different results if you call it on half the matrix at a - time. Most of the time this would be pretty harmless, so we still return the - kSimpleComponent flag. We may have to modify the test code a little to - account for this, or possibly remove the kSimpleComponent flag. In some sense - each output Index depends on every input Index, but putting those dependencies - explicitly into the dependency-tracking framework as a GeneralComponent - would be very impractical and might lead to a lot of unnecessary things being - computed. You have to be a bit careful where you put this component, and understand - what you're doing e.g. putting it in the path of a recurrence is a bit problematic - if the minibatch size is small. - - Accepted configuration values: - dim Dimension of the input and output - block-dim Defaults to 'dim', but may be set to a nonzero divisor - of 'dim'. 
In this case, each block of dimension 'block-dim' - is treated like a separate row of the input matrix, which - means that the stats from n'th element of each - block are pooled into one class, for each n.a - epsilon Small term added to the variance that is used to prevent - division by zero - target-rms This defaults to 1.0, but if set, for instance, to 2.0, - it will normalize the standard deviation of the output to - 2.0. 'target-stddev' might be a more suitable name, but this - was chosen for consistency with NormalizeComponent. - */ -class BatchNormComponent: public Component { - public: - - BatchNormComponent() { } - - // call this with 'true' to set 'test mode' where the batch normalization is - // done with stored stats. There won't normally be any need to specially - // accumulate these stats; they are stored as a matter of course on each - // iteration of training, as for NonlinearComponents, and we'll use the stats - // from the most recent [script-level] iteration. - void SetTestMode(bool test_mode); - - // constructor using another component - BatchNormComponent(const BatchNormComponent &other); - - virtual int32 InputDim() const { return dim_; } - virtual int32 OutputDim() const { return dim_; } - - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - virtual std::string Type() const { return "BatchNormComponent"; } - virtual int32 Properties() const { - // If the block-dim is less than the dim, we need the input and output - // matrices to be contiguous (stride==num-cols), as we'll be reshaping - // internally. This is not much of a cost, because this will be used - // in convnets where we have to do this anyway. - return kSimpleComponent|kBackpropNeedsOutput|kPropagateInPlace| - kBackpropInPlace| - (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0)| - (test_mode_ ? 
0 : kUsesMemo|kStoresStats); - } - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *, // to_update, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); // This Read function - // requires that the Component has the correct type. - - /// Write component to stream - virtual void Write(std::ostream &os, bool binary) const; - virtual Component* Copy() const { return new BatchNormComponent(*this); } - - virtual void Scale(BaseFloat scale); - virtual void Add(BaseFloat alpha, const Component &other); - virtual void ZeroStats(); - - - virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } - - virtual void StoreStats(const CuMatrixBase &in_value, - const CuMatrixBase &out_value, - void *memo); - - // Members specific to this component type. - // Note: the offset and scale will only be nonempty in 'test mode'. - const CuVector &Offset() const { return offset_; } - const CuVector &Scale() const { return scale_; } - - private: - - struct Memo { - // number of frames (after any reshaping). - int32 num_frames; - // 'sum_sumsq_scale' is of dimension 4 by block_dim_: - // Row 0 = mean = the mean of the rows of the input - // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). - // Row 2 = scale = the scale of the renormalization, which is - // Row 3 is used as a temporary in Backprop. - // the inverse stddev of the input (modified by epsilon_, - // see the Propagate function. - CuMatrix mean_uvar_scale; - }; - - void Check() const; - - // this function is used in a couple of places; it turns the raw stats into - // the offset/scale term of a normalizing transform. 
- static void ComputeOffsetAndScale(double count, - BaseFloat epsilon, - const Vector &stats_sum, - const Vector &stats_sumsq, - Vector *offset, - Vector *scale); - // computes derived parameters offset_ and scale_. - void ComputeDerived(); - - // Dimension of the input and output. - int32 dim_; - // This would normally be the same as dim_, but if it's less (and it must be > - // 0 and must divide dim_), then each separate block of the input of dimension - // 'block_dim_' is treated like a separate frame for the purposes of - // normalization. This can be used to implement spatial batch normalization - // for convolutional setups-- assuming the filter-dim has stride 1, which it - // always will in the new code in nnet-convolutional-component.h. - int32 block_dim_; - - // Used to avoid exact-zero variances, epsilon has the dimension of a - // covariance. - BaseFloat epsilon_; - - // This value will normally be 1.0, which is the default, but you can set it - // to other values as a way to control how fast the following layer learns - // (smaller -> slower). The same config exists in NormalizeComponent. - BaseFloat target_rms_; - - // This is true if we want the batch normalization to operate in 'test mode' - // meaning the data mean and stddev used for the normalization are fixed - // quantities based on previously accumulated stats. Note: the stats we use - // for this are based on the same 'StoreStats' mechanism as we use for - // components like SigmoidComponent and ReluComponent; we'll be using - // the stats from the most recent [script-level] iteration of training. - bool test_mode_; - - - // total count of stats stored by StoreStats(). - double count_; - // sum-of-data component of stats of input data. - CuVector stats_sum_; - // sum-of-squared component of stats of input data. - CuVector stats_sumsq_; - - // offset_ and scale_ are derived from stats_sum_ and stats_sumsq_; they - // dictate the transform that is done in 'test mode'. 
They are set only when - // reading the model from disk and when calling SetTestMode(true); they are - // resized to empty when the stats are updated, to ensure that out-of-date - // values are not kept around. - CuVector offset_; - CuVector scale_; -}; - - /** CompositeComponent is a component representing a sequence of [simple] components. The config line would be something like the following diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 83b902a9b90..6ed0b6f9191 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1711,6 +1711,16 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " learning-rate=" << learning_rate; break; } + /* case 35: { + *component_type = "MemoryNormComponent"; + int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2); + os << " dim=" << dim + << " block-dim=" << block_dim << " target-rms=" + << RandInt(1, 2) << " include-indirect-derivative=" + << (RandInt(0, 1) == 0 ? "true" : "false") + << " epsilon=" << (RandInt(0, 1) == 0 ? "0.1" : "1.0"); + break; + }*/ default: KALDI_ERR << "Error generating random component"; } diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 64fc3003609..b000938d513 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -22,6 +22,7 @@ #include "nnet3/nnet-utils.h" #include "nnet3/nnet-graph.h" #include "nnet3/nnet-simple-component.h" +#include "nnet3/nnet-normalize-component.h" #include "nnet3/nnet-general-component.h" #include "nnet3/nnet-convolutional-component.h" #include "nnet3/nnet-parse.h" From 922fc902dd051ba101cb999a491366c8ec8b6cd0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 10 Dec 2017 17:42:20 -0800 Subject: [PATCH 006/184] [src] Making MemoryNormComponent behave as BatchNormComponent in ScaleBatchnormStats, etc. 
--- src/nnet3/nnet-utils.cc | 11 ++++++++--- src/nnet3/nnet-utils.h | 9 +++++---- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index b000938d513..7ae6bb99f09 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -492,9 +492,8 @@ void SetDropoutProportion(BaseFloat dropout_proportion, bool HasBatchnorm(const Nnet &nnet) { for (int32 c = 0; c < nnet.NumComponents(); c++) { const Component *comp = nnet.GetComponent(c); - const BatchNormComponent *bc = - dynamic_cast(comp); - if (bc != NULL) + if (dynamic_cast(comp) != NULL || + dynamic_cast(comp) != NULL) return true; } return false; @@ -510,6 +509,9 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, BatchNormComponent *bc = dynamic_cast(comp); if (bc != NULL) bc->Scale(batchnorm_stats_scale); + MemoryNormComponent *mc = dynamic_cast(comp); + if (mc != NULL) + mc->Scale(batchnorm_stats_scale); } } @@ -534,6 +536,9 @@ void SetBatchnormTestMode(bool test_mode, Nnet *nnet) { BatchNormComponent *bc = dynamic_cast(comp); if (bc != NULL) bc->SetTestMode(test_mode); + MemoryNormComponent *mc = dynamic_cast(comp); + if (mc != NULL) + mc->SetTestMode(test_mode); } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index d961b7cb6a0..b44b16b3606 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -169,10 +169,11 @@ void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); /// Returns true if nnet has at least one component of type -/// BatchNormComponent. +/// BatchNormComponent or MemoryNormComponent bool HasBatchnorm(const Nnet &nnet); -/// This function affects only components of type BatchNormComponent. +/// This function affects only components of type BatchNormComponent or +/// MemoryNormComponent. /// It sets "test mode" on such components (if you call it with test_mode = /// true, otherwise it would set normal mode, but this wouldn't be needed /// often). 
"test mode" means that instead of using statistics from the batch, @@ -445,8 +446,8 @@ void ApplyL2Regularization(const Nnet &nnet, /** This function scales the batchorm stats of any batchnorm components - (components of type BatchNormComponent) in 'nnet' by the scale - 'batchnorm_stats_scale'. + (components of type BatchNormComponent or MemoryNormComponent) in 'nnet' by + the scale 'batchnorm_stats_scale'. */ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, Nnet *nnet); From f400999a64143b67d88df0ba03f2b2b1d5a20772 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 10 Dec 2017 17:53:05 -0800 Subject: [PATCH 007/184] [src] Bug-fix in MemoryNormComponent --- src/nnet3/nnet-normalize-component.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index ac3817adfbe..ef6d7f4a6f3 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -1113,6 +1113,7 @@ void MemoryNormComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &backward_count_); ExpectToken(is, binary, ""); data_.Read(is, binary); + ExpectToken(is, binary, ""); Check(); } From 5bb3870c75093360e21f57de2e55c0d8acbc76a2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 10 Dec 2017 19:43:29 -0800 Subject: [PATCH 008/184] [src] Fix bug in MemoryNormComponent --- src/nnet3/nnet-normalize-component.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index ef6d7f4a6f3..62a8afc6472 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -867,15 +867,20 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, if (test_mode_) { if (stats_count_ <= 0.0) KALDI_ERR << "Test mode set but no stats available."; - CuSubVector x_mean(data_, 3), scale(data_, 4); + CuSubVector x_mean(data_, 0), 
scale(data_, 4); out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); return NULL; } else { Memo *memo = GetMemo(in); - CuSubVector x_sum(memo->data, 0), - scale(memo->data, 2); - out->AddVecToRows(-1.0 / memo->num_frames, x_sum); + if (stats_count_ <= 0.0) { + CuSubVector x_sum(memo->data, 0); + out->AddVecToRows(-1.0 / memo->num_frames, x_sum); + } else { // use the mean stored with this object. + CuSubVector x_mean(data_, 0); + out->AddVecToRows(-1.0, x_mean); + } + CuSubVector scale(memo->data, 2); out->MulColsVec(scale); return memo; } From 7e9cc29b0a9edb3c6e952c5997d61d3aafac1e2f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 10 Dec 2017 19:58:22 -0800 Subject: [PATCH 009/184] [src] Some reorganizations of MemoryNormComponent code --- src/nnet3/nnet-normalize-component.cc | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 62a8afc6472..7eca8594748 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -862,28 +862,26 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, if (out->Data() != in.Data()) out->CopyFromMat(in); + if (test_mode_ && stats_count_ <= 0.0) + KALDI_ERR << "Test mode set but no stats available."; + // From this point, we can assume that the num-cols of 'in' and 'out' // equals block_dim_. - if (test_mode_) { - if (stats_count_ <= 0.0) - KALDI_ERR << "Test mode set but no stats available."; + Memo *ans = NULL; + if (!test_mode_) + ans = GetMemo(in); + + if (test_mode_ || stats_count_ > 0.0) { CuSubVector x_mean(data_, 0), scale(data_, 4); out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); - return NULL; } else { - Memo *memo = GetMemo(in); - if (stats_count_ <= 0.0) { - CuSubVector x_sum(memo->data, 0); - out->AddVecToRows(-1.0 / memo->num_frames, x_sum); - } else { // use the mean stored with this object. 
- CuSubVector x_mean(data_, 0); - out->AddVecToRows(-1.0, x_mean); - } - CuSubVector scale(memo->data, 2); + CuSubVector x_sum(memo->data, 0), + scale(memo->data, 2); + out->AddVecToRows(-1.0 / memo->num_frames, x_sum); out->MulColsVec(scale); - return memo; } + return memo; } From 39e6f777524b3270c1604c8134fd1d37362ca1f0 Mon Sep 17 00:00:00 2001 From: freewym Date: Mon, 11 Dec 2017 17:58:40 -0500 Subject: [PATCH 010/184] changes according to the review --- src/chainbin/nnet3-chain-combine.cc | 133 +++--- src/nnet3/nnet-chain-combine.cc | 610 ---------------------------- src/nnet3/nnet-chain-combine.h | 205 ---------- src/nnet3/nnet-combine.cc | 606 --------------------------- src/nnet3/nnet-combine.h | 251 ------------ src/nnet3bin/nnet3-combine.cc | 132 +++--- 6 files changed, 152 insertions(+), 1785 deletions(-) delete mode 100644 src/nnet3/nnet-chain-combine.cc delete mode 100644 src/nnet3/nnet-chain-combine.h delete mode 100644 src/nnet3/nnet-combine.cc delete mode 100644 src/nnet3/nnet-combine.h diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 7dece5cb070..520575e1d88 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -28,44 +28,54 @@ namespace kaldi { namespace nnet3 { -double ComputeObjf(const std::vector &egs, +// Computes the objective of the moving average of nnet on egs. If either of +// batchnorm/dropout test modes is true, we make a copy of the moving average, +// set test modes on that and evaluate its objective. Note: the object that +// prob_computer->nnet_ refers to should be moving_average_nnet. 
+double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, + const std::vector &egs, + const Nnet &moving_average_nnet, + const chain::ChainTrainingOptions &chain_config, + const fst::StdVectorFst &den_fst, NnetChainComputeProb *prob_computer) { - prob_computer->Reset(); - std::vector::const_iterator iter = egs.begin(), - end = egs.end(); - for (; iter != end; ++iter) - prob_computer->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer->GetObjective("output"); - if (objf_info == NULL) - KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); - // we prefer to deal with normalized objective functions. - return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; + if (batchnorm_test_mode || dropout_test_mode) { + Nnet moving_average_nnet_copy(moving_average_nnet); + if (batchnorm_test_mode) + SetBatchnormTestMode(true, &moving_average_nnet_copy); + if (dropout_test_mode) + SetDropoutTestMode(true, &moving_average_nnet_copy); + NnetComputeProbOptions compute_prob_opts; + NnetChainComputeProb prob_computer_test(compute_prob_opts, chain_config, + den_fst, moving_average_nnet_copy); + return ComputeObjf(false, false, egs, moving_average_nnet_copy, + chain_config, den_fst, &prob_computer_test); + } else { + prob_computer->Reset(); + std::vector::const_iterator iter = egs.begin(), + end = egs.end(); + for (; iter != end; ++iter) + prob_computer->Compute(*iter); + const ChainObjectiveInfo *objf_info = + prob_computer->GetObjective("output"); + if (objf_info == NULL) + KALDI_ERR << "Error getting objective info (unsuitable egs?)"; + KALDI_ASSERT(objf_info->tot_weight > 0.0); + // inf/nan tot_objf->return -inf objective. + double tot_objf = objf_info->tot_like + objf_info->tot_l2_term; + if (!(tot_objf == tot_objf && tot_objf - tot_objf == 0)) + return -std::numeric_limits::infinity(); + // we prefer to deal with normalized objective functions. 
+ return tot_objf / objf_info->tot_weight; + } } -// Note: the object that prob_computer.nnet_ refers to should be -// *moving_average_nnet. -double UpdateNnetMovingAverageAndComputeObjf(int32 num_models, - const std::vector &egs, - const Nnet &nnet, Nnet *moving_average_nnet, - NnetChainComputeProb *prob_computer) { - int32 num_params = NumParameters(nnet); - KALDI_ASSERT(num_params == NumParameters(*moving_average_nnet)); - Vector nnet_params(num_params, kUndefined), - moving_average_nnet_params(num_params, kUndefined); - VectorizeNnet(nnet, &nnet_params); - VectorizeNnet(*moving_average_nnet, &moving_average_nnet_params); - moving_average_nnet_params.Scale((num_models - 1.0) / num_models); - moving_average_nnet_params.AddVec(1.0 / num_models, nnet_params); - - BaseFloat sum = moving_average_nnet_params.Sum(); - // inf/nan parameters->return -inf objective. - if (!(sum == sum && sum - sum == 0)) - return -std::numeric_limits::infinity(); - - UnVectorizeNnet(moving_average_nnet_params, moving_average_nnet); - return ComputeObjf(egs, prob_computer); +// Updates moving average over num_models nnets, given the average over +// previous (num_models - 1) nnets, and the new nnet. 
+void UpdateNnetMovingAverage(int32 num_models, + const Nnet &nnet, Nnet *moving_average_nnet) { + KALDI_ASSERT(NumParameters(nnet) == NumParameters(*moving_average_nnet)); + ScaleNnet((num_models - 1.0) / num_models, moving_average_nnet); + AddNnet(nnet, 1.0 / num_models, moving_average_nnet); } } @@ -93,6 +103,7 @@ int main(int argc, char *argv[]) { " nnet3-combine den.fst 35.raw 36.raw 37.raw 38.raw ark:valid.cegs final.raw\n"; bool binary_write = true; + int32 max_objective_evaluations = 30; bool batchnorm_test_mode = false, dropout_test_mode = true; std::string use_gpu = "yes"; @@ -100,13 +111,19 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("max-objective-evaluations", &max_objective_evaluations, "Max " + "number of objective evaluations in order to figure out the " + "best number of models to combine. It helps to speedup if " + "the number of models provided to this binary is quite large " + "(e.g. 
several hundred)."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("batchnorm-test-mode", &batchnorm_test_mode, - "If true, set test-mode to true on any BatchNormComponents."); + "If true, set test-mode to true on any BatchNormComponents " + "while evaluating objectives."); po.Register("dropout-test-mode", &dropout_test_mode, "If true, set test-mode to true on any DropoutComponents and " - "DropoutMaskComponents."); + "DropoutMaskComponents while evaluating objectives."); chain_config.Register(&po); @@ -135,13 +152,8 @@ int main(int argc, char *argv[]) { ReadKaldiObject(raw_nnet_rxfilename, &nnet); Nnet moving_average_nnet(nnet), best_nnet(nnet); NnetComputeProbOptions compute_prob_opts; - NnetChainComputeProb *prob_computer = new NnetChainComputeProb( - compute_prob_opts, chain_config, den_fst, moving_average_nnet); - - if (batchnorm_test_mode) - SetBatchnormTestMode(true, &nnet); - if (dropout_test_mode) - SetDropoutTestMode(true, &nnet); + NnetChainComputeProb prob_computer(compute_prob_opts, chain_config, + den_fst, moving_average_nnet); std::vector egs; egs.reserve(10000); // reserve a lot of space to minimize the chance of @@ -156,26 +168,35 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(!egs.empty()); } - int32 best_n = 1; - double best_objf = ComputeObjf(egs, prob_computer); + // first evaluates the objective using the last model. + int32 best_num_to_combine = 1; + double best_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, chain_config, den_fst, &prob_computer); KALDI_LOG << "objective function using the last model is " << best_objf; int32 num_nnets = po.NumArgs() - 3; - + // then each time before we re-evaluate the objective function, we will add + // num_to_add models to the moving average. 
+ int32 num_to_add = (num_nnets + max_objective_evaluations - 1) / + max_objective_evaluations; for (int32 n = 1; n < num_nnets; n++) { std::string this_nnet_rxfilename = po.GetArg(n + 2); ReadKaldiObject(this_nnet_rxfilename, &nnet); - double objf = UpdateNnetMovingAverageAndComputeObjf(n + 1, egs, nnet, - &moving_average_nnet, prob_computer); - KALDI_LOG << "Combining last " << n + 1 - << " models, objective function is " << objf; - if (objf > best_objf) { - best_objf = objf; - best_nnet = moving_average_nnet; - best_n = n + 1; + // updates the moving average + UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { + double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, chain_config, den_fst, &prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_num_to_combine = n + 1; + } } } - KALDI_LOG << "Using the model averaged over last " << best_n + KALDI_LOG << "Using the model averaged over last " << best_num_to_combine << " models, objective function is " << best_objf; if (HasBatchnorm(nnet)) diff --git a/src/nnet3/nnet-chain-combine.cc b/src/nnet3/nnet-chain-combine.cc deleted file mode 100644 index c93858fb06e..00000000000 --- a/src/nnet3/nnet-chain-combine.cc +++ /dev/null @@ -1,610 +0,0 @@ -// nnet3/nnet-chain-combine.cc - -// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "nnet3/nnet-chain-combine.h" -#include "nnet3/nnet-utils.h" - -namespace kaldi { -namespace nnet3 { - -NnetChainCombiner::NnetChainCombiner(const NnetCombineConfig &combine_config, - const chain::ChainTrainingOptions &chain_config, - int32 num_nnets, - const std::vector &egs, - const fst::StdVectorFst &den_fst, - const Nnet &first_nnet): - combine_config_(combine_config), - chain_config_(chain_config), - egs_(egs), - den_fst_(den_fst), - nnet_(first_nnet), - num_real_input_nnets_(num_nnets), - nnet_params_(std::min(num_nnets, combine_config_.max_effective_inputs), - NumParameters(first_nnet)), - tot_input_weighting_(nnet_params_.NumRows()) { - - if (combine_config_.sum_to_one_penalty != 0.0 && - combine_config_.enforce_sum_to_one) { - KALDI_WARN << "--sum-to-one-penalty=" << combine_config_.sum_to_one_penalty - << " is nonzero, so setting --enforce-sum-to-one=false."; - combine_config_.enforce_sum_to_one = false; - } - SubVector first_params(nnet_params_, 0); - VectorizeNnet(nnet_, &first_params); - tot_input_weighting_(0) += 1.0; - num_nnets_provided_ = 1; - ComputeUpdatableComponentDims(); - NnetComputeProbOptions compute_prob_opts; - compute_prob_opts.compute_deriv = true; - prob_computer_ = new NnetChainComputeProb(compute_prob_opts, chain_config_, den_fst_, nnet_); -} - -void NnetChainCombiner::ComputeUpdatableComponentDims(){ - updatable_component_dims_.clear(); - for (int32 c = 0; c < nnet_.NumComponents(); c++) { - Component *comp = nnet_.GetComponent(c); - if 
(comp->Properties() & kUpdatableComponent) { - // For now all updatable components inherit from class UpdatableComponent. - // If that changes in future, we will change this code. - UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - "UpdatableComponent; change this code."; - updatable_component_dims_.push_back(uc->NumParameters()); - } - } -} - -void NnetChainCombiner::AcceptNnet(const Nnet &nnet) { - KALDI_ASSERT(num_nnets_provided_ < num_real_input_nnets_ && - "You called AcceptNnet too many times."); - int32 num_effective_nnets = nnet_params_.NumRows(); - if (num_effective_nnets == num_real_input_nnets_) { - SubVector this_params(nnet_params_, num_nnets_provided_); - VectorizeNnet(nnet, &this_params); - tot_input_weighting_(num_nnets_provided_) += 1.0; - } else { - // this_index is a kind of warped index, mapping the range - // 0 ... num_real_inputs_nnets_ - 1 onto the range - // 0 ... num_effective_nnets - 1. View the index as falling in - // between two integer indexes and determining weighting factors. - // we could view this as triangular bins. 
- BaseFloat this_index = num_nnets_provided_ * (num_effective_nnets - 1) - / static_cast(num_real_input_nnets_ - 1); - int32 lower_index = std::floor(this_index), - upper_index = lower_index + 1; - BaseFloat remaining_part = this_index - lower_index, - lower_weight = 1.0 - remaining_part, - upper_weight = remaining_part; - KALDI_ASSERT(lower_index >= 0 && upper_index <= num_effective_nnets && - lower_weight >= 0.0 && upper_weight >= 0.0 && - lower_weight <= 1.0 && upper_weight <= 1.0); - Vector vec(nnet_params_.NumCols(), kUndefined); - VectorizeNnet(nnet, &vec); - nnet_params_.Row(lower_index).AddVec(lower_weight, vec); - tot_input_weighting_(lower_index) += lower_weight; - if (upper_index == num_effective_nnets) { - KALDI_ASSERT(upper_weight < 0.1); - } else { - nnet_params_.Row(upper_index).AddVec(upper_weight, vec); - tot_input_weighting_(upper_index) += upper_weight; - } - } - num_nnets_provided_++; -} - -void NnetChainCombiner::FinishPreprocessingInput() { - KALDI_ASSERT(num_nnets_provided_ == num_real_input_nnets_ && - "You did not call AcceptInput() enough times."); - int32 num_effective_nnets = nnet_params_.NumRows(); - for (int32 i = 0; i < num_effective_nnets; i++) { - BaseFloat tot_weight = tot_input_weighting_(i); - KALDI_ASSERT(tot_weight > 0.0); // Or would be a coding error. - // Rescale so this row is like a weighted average instead of - // a weighted sum. - if (tot_weight != 1.0) - nnet_params_.Row(i).Scale(1.0 / tot_weight); - } -} - -void NnetChainCombiner::Combine() { - FinishPreprocessingInput(); - - if (!SelfTestDerivatives()) { - KALDI_LOG << "Self-testing model derivatives since parameter-derivatives " - "self-test failed."; - SelfTestModelDerivatives(); - } - - int32 dim = ParameterDim(); - LbfgsOptions lbfgs_options; - lbfgs_options.minimize = false; // We're maximizing. - lbfgs_options.m = dim; // Store the same number of vectors as the dimension - // itself, so this is BFGS. 
- lbfgs_options.first_step_impr = combine_config_.initial_impr; - - Vector params(dim), deriv(dim); - double objf, initial_objf; - GetInitialParameters(¶ms); - - - OptimizeLbfgs lbfgs(params, lbfgs_options); - - for (int32 i = 0; i < combine_config_.num_iters; i++) { - params.CopyFromVec(lbfgs.GetProposedValue()); - objf = ComputeObjfAndDerivFromParameters(params, &deriv); - KALDI_VLOG(2) << "Iteration " << i << " params = " << params - << ", objf = " << objf << ", deriv = " << deriv; - if (i == 0) initial_objf = objf; - lbfgs.DoStep(objf, deriv); - } - - if (!combine_config_.sum_to_one_penalty) { - KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf; - } else { - Vector weights(WeightDim()); - GetWeights(params, &weights); - bool print_weights = true; - double penalty = GetSumToOnePenalty(weights, NULL, print_weights); - // note: initial_objf has no penalty term because it summed exactly - // to one. - KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf << " = " - << (objf - penalty) << " + " << penalty; - } - - - // must recompute nnet_ if "params" is not exactly equal to the - // final params that LB - Vector final_params(dim); - final_params.CopyFromVec(lbfgs.GetValue(&objf)); - if (!params.ApproxEqual(final_params, 0.0)) { - // the following call makes sure that nnet_ corresponds to the parameters - // in "params". 
- ComputeObjfAndDerivFromParameters(final_params, &deriv); - } - PrintParams(final_params); -} - - -void NnetChainCombiner::PrintParams(const VectorBase ¶ms) const { - Vector weights(WeightDim()), normalized_weights(WeightDim()); - GetWeights(params, &weights); - GetNormalizedWeights(weights, &normalized_weights); - int32 num_models = nnet_params_.NumRows(), - num_uc = NumUpdatableComponents(); - - if (combine_config_.separate_weights_per_component) { - std::vector updatable_component_names; - for (int32 c = 0; c < nnet_.NumComponents(); c++) { - const Component *comp = nnet_.GetComponent(c); - if (comp->Properties() & kUpdatableComponent) - updatable_component_names.push_back(nnet_.GetComponentName(c)); - } - KALDI_ASSERT(static_cast(updatable_component_names.size()) == - NumUpdatableComponents()); - for (int32 uc = 0; uc < num_uc; uc++) { - std::ostringstream os; - os.width(20); - os << std::left << updatable_component_names[uc] << ": "; - os.width(9); - os.precision(4); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + uc; - os << " " << std::left << normalized_weights(index); - } - KALDI_LOG << "Weights for " << os.str(); - } - } else { - int32 c = 0; // arbitrarily chosen; they'll all be the same. - std::ostringstream os; - os.width(9); - os.precision(4); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - os << " " << std::left << normalized_weights(index); - } - KALDI_LOG << "Model weights are: " << os.str(); - } - int32 num_effective_nnets = nnet_params_.NumRows(); - if (num_effective_nnets != num_real_input_nnets_) - KALDI_LOG << "Above, only " << num_effective_nnets << " weights were " - "printed due to the the --num-effective-nnets option; " - "there were " << num_real_input_nnets_ << " actual input nnets. 
" - "Each weight corresponds to a weighted average over a range of " - "nnets in the sequence (with triangular bins)"; -} - -bool NnetChainCombiner::SelfTestDerivatives() { - int32 num_tests = 2; // more properly, this is the number of dimensions in a - // single test. - double delta = 0.001; - int32 dim = ParameterDim(); - - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), - observed_changes(num_tests); - - GetInitialParameters(¶ms); - double initial_objf = ComputeObjfAndDerivFromParameters(params, - &deriv); - for (int32 i = 0; i < num_tests; i++) { - Vector new_deriv(dim), offset(dim), new_params(params); - offset.SetRandn(); - new_params.AddVec(delta, offset); - double new_objf = ComputeObjfAndDerivFromParameters(new_params, - &new_deriv); - // for predicted changes, interpolate old and new derivs. - predicted_changes(i) = - 0.5 * VecVec(new_params, deriv) - 0.5 * VecVec(params, deriv) + - 0.5 * VecVec(new_params, new_deriv) - 0.5 * VecVec(params, new_deriv); - observed_changes(i) = new_objf - initial_objf; - } - double threshold = 0.1; - KALDI_LOG << "predicted_changes = " << predicted_changes; - KALDI_LOG << "observed_changes = " << observed_changes; - if (!ApproxEqual(predicted_changes, observed_changes, threshold)) { - KALDI_WARN << "Derivatives self-test failed."; - return false; - } else { - return true; - } -} - - -void NnetChainCombiner::SelfTestModelDerivatives() { - int32 num_tests = 3; // more properly, this is the number of dimensions in a - // single test. 
- int32 dim = ParameterDim(); - - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), - observed_changes(num_tests); - - GetInitialParameters(¶ms); - Vector weights(WeightDim()), normalized_weights(WeightDim()); - Vector nnet_params(NnetParameterDim(), kUndefined), - nnet_deriv(NnetParameterDim(), kUndefined); - GetWeights(params, &weights); - GetNormalizedWeights(weights, &normalized_weights); - GetNnetParameters(normalized_weights, &nnet_params); - - double initial_objf = ComputeObjfAndDerivFromNnet(nnet_params, - &nnet_deriv); - - double delta = 0.002 * std::sqrt(VecVec(nnet_params, nnet_params) / - NnetParameterDim()); - - - for (int32 i = 0; i < num_tests; i++) { - Vector new_nnet_deriv(NnetParameterDim()), - offset(NnetParameterDim()), new_nnet_params(nnet_params); - offset.SetRandn(); - new_nnet_params.AddVec(delta, offset); - double new_objf = ComputeObjfAndDerivFromNnet(new_nnet_params, - &new_nnet_deriv); - // for predicted changes, interpolate old and new derivs. 
- predicted_changes(i) = - 0.5 * VecVec(new_nnet_params, nnet_deriv) - - 0.5 * VecVec(nnet_params, nnet_deriv) + - 0.5 * VecVec(new_nnet_params, new_nnet_deriv) - - 0.5 * VecVec(nnet_params, new_nnet_deriv); - observed_changes(i) = new_objf - initial_objf; - } - double threshold = 0.1; - KALDI_LOG << "model-derivatives: predicted_changes = " << predicted_changes; - KALDI_LOG << "model-derivatives: observed_changes = " << observed_changes; - if (!ApproxEqual(predicted_changes, observed_changes, threshold)) - KALDI_WARN << "Model derivatives self-test failed."; -} - - - - -int32 NnetChainCombiner::ParameterDim() const { - if (combine_config_.separate_weights_per_component) - return NumUpdatableComponents() * nnet_params_.NumRows(); - else - return nnet_params_.NumRows(); -} - - -void NnetChainCombiner::GetInitialParameters(VectorBase *params) const { - KALDI_ASSERT(params->Dim() == ParameterDim()); - params->Set(1.0 / nnet_params_.NumRows()); - if (combine_config_.enforce_positive_weights) { - // we enforce positive weights by treating the params as the log of the - // actual weight. - params->ApplyLog(); - } -} - -void NnetChainCombiner::GetWeights(const VectorBase ¶ms, - VectorBase *weights) const { - KALDI_ASSERT(weights->Dim() == WeightDim()); - if (combine_config_.separate_weights_per_component) { - weights->CopyFromVec(params); - } else { - int32 nc = NumUpdatableComponents(); - // have one parameter per row of nnet_params_, and need to repeat - // the weight for the different components. - for (int32 n = 0; n < nnet_params_.NumRows(); n++) { - for (int32 c = 0; c < nc; c++) - (*weights)(n * nc + c) = params(n); - } - } - // we enforce positive weights by having the weights be the exponential of the - // corresponding parameters. 
- if (combine_config_.enforce_positive_weights) - weights->ApplyExp(); -} - - -void NnetChainCombiner::GetParamsDeriv(const VectorBase &weights, - const VectorBase &weights_deriv, - VectorBase *param_deriv) { - KALDI_ASSERT(weights.Dim() == WeightDim() && - param_deriv->Dim() == ParameterDim()); - Vector preexp_weights_deriv(weights_deriv); - if (combine_config_.enforce_positive_weights) { - // to enforce positive weights we first compute weights (call these - // preexp_weights) and then take exponential. Note, d/dx exp(x) = exp(x). - // So the derivative w.r.t. the preexp_weights equals the derivative - // w.r.t. the weights, times the weights. - preexp_weights_deriv.MulElements(weights); - } - if (combine_config_.separate_weights_per_component) { - param_deriv->CopyFromVec(preexp_weights_deriv); - } else { - int32 nc = NumUpdatableComponents(); - param_deriv->SetZero(); - for (int32 n = 0; n < nnet_params_.NumRows(); n++) - for (int32 c = 0; c < nc; c++) - (*param_deriv)(n) += preexp_weights_deriv(n * nc + c); - } -} - -double NnetChainCombiner::GetSumToOnePenalty( - const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights) const { - - KALDI_ASSERT(combine_config_.sum_to_one_penalty >= 0.0); - double penalty = combine_config_.sum_to_one_penalty; - if (penalty == 0.0) { - weights_penalty_deriv->SetZero(); - return 0.0; - } - double ans = 0.0; - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - Vector tot_weights(num_uc); - std::ostringstream tot_weight_info; - for (int32 c = 0; c < num_uc; c++) { - double this_total_weight = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - double this_weight = weights(index); - this_total_weight += this_weight; - } - tot_weights(c) = this_total_weight; - // this_total_weight_deriv is the derivative of the penalty - // term w.r.t. this component's total weight. 
- double this_total_weight_deriv; - if (combine_config_.enforce_positive_weights) { - // if combine_config_.enforce_positive_weights is true, then we choose to - // formulate the penalty in a slightly different way.. this solves the - // problem that with the formulation in the 'else' below, if for some - // reason the total weight is << 1.0, the deriv w.r.t. the actual - // parameters gets tiny [because weight = exp(params)]. - double log_total = log(this_total_weight); - ans += -0.5 * penalty * log_total * log_total; - double log_total_deriv = -1.0 * penalty * log_total; - this_total_weight_deriv = log_total_deriv / this_total_weight; - } else { - ans += -0.5 * penalty * - (this_total_weight - 1.0) * (this_total_weight - 1.0); - this_total_weight_deriv = penalty * (1.0 - this_total_weight); - - } - if (weights_penalty_deriv != NULL) { - KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*weights_penalty_deriv)(index) = this_total_weight_deriv; - } - } - } - if (print_weights) { - Vector tot_weights_float(tot_weights); - KALDI_LOG << "Total weights per component: " - << PrintVectorPerUpdatableComponent(nnet_, - tot_weights_float); - } - return ans; -} - -void NnetChainCombiner::GetNnetParameters(const Vector &weights, - VectorBase *nnet_params) const { - KALDI_ASSERT(nnet_params->Dim() == nnet_params_.NumCols()); - nnet_params->SetZero(); - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 m = 0; m < num_models; m++) { - const SubVector src_params(nnet_params_, m); - int32 dim_offset = 0; - for (int32 c = 0; c < num_uc; c++) { - int32 index = m * num_uc + c; - BaseFloat weight = weights(index); - int32 dim = updatable_component_dims_[c]; - const SubVector src_component_params(src_params, dim_offset, - dim); - SubVector dest_component_params(*nnet_params, dim_offset, dim); - dest_component_params.AddVec(weight, src_component_params); - 
dim_offset += dim; - } - KALDI_ASSERT(dim_offset == nnet_params_.NumCols()); - } -} - -// compare GetNnetParameters. -void NnetChainCombiner::GetWeightsDeriv( - const VectorBase &nnet_params_deriv, - VectorBase *weights_deriv) { - KALDI_ASSERT(nnet_params_deriv.Dim() == nnet_params_.NumCols() && - weights_deriv->Dim() == WeightDim()); - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 m = 0; m < num_models; m++) { - const SubVector src_params(nnet_params_, m); - int32 dim_offset = 0; - for (int32 c = 0; c < num_uc; c++) { - int32 index = m * num_uc + c; - int32 dim = updatable_component_dims_[c]; - const SubVector src_component_params(src_params, dim_offset, - dim); - const SubVector component_params_deriv(nnet_params_deriv, - dim_offset, dim); - (*weights_deriv)(index) = VecVec(src_component_params, - component_params_deriv); - dim_offset += dim; - } - KALDI_ASSERT(dim_offset == nnet_params_.NumCols()); - } -} - -double NnetChainCombiner::ComputeObjfAndDerivFromNnet( - VectorBase &nnet_params, - VectorBase *nnet_params_deriv) { - BaseFloat sum = nnet_params.Sum(); - // inf/nan parameters->return -inf objective. - if (!(sum == sum && sum - sum == 0)) - return -std::numeric_limits::infinity(); - // Set nnet to have these params. - UnVectorizeNnet(nnet_params, &nnet_); - - prob_computer_->Reset(); - std::vector::const_iterator iter = egs_.begin(), - end = egs_.end(); - for (; iter != end; ++iter) - prob_computer_->Compute(*iter); - const ChainObjectiveInfo *objf_info = - prob_computer_->GetObjective("output"); - if (objf_info == NULL) - KALDI_ERR << "Error getting objective info (unsuitable egs?)"; - KALDI_ASSERT(objf_info->tot_weight > 0.0); - const Nnet &deriv = prob_computer_->GetDeriv(); - VectorizeNnet(deriv, nnet_params_deriv); - // we prefer to deal with normalized objective functions. 
- nnet_params_deriv->Scale(1.0 / objf_info->tot_weight); - return (objf_info->tot_like + objf_info->tot_l2_term) / objf_info->tot_weight; -} - - -double NnetChainCombiner::ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv) { - Vector weights(WeightDim()), normalized_weights(WeightDim()), - weights_sum_to_one_penalty_deriv(WeightDim()), - normalized_weights_deriv(WeightDim()), weights_deriv(WeightDim()); - Vector - nnet_params(NnetParameterDim(), kUndefined), - nnet_params_deriv(NnetParameterDim(), kUndefined); - GetWeights(params, &weights); - double ans = GetSumToOnePenalty(weights, &weights_sum_to_one_penalty_deriv); - GetNormalizedWeights(weights, &normalized_weights); - GetNnetParameters(normalized_weights, &nnet_params); - ans += ComputeObjfAndDerivFromNnet(nnet_params, &nnet_params_deriv); - if (ans != ans || ans - ans != 0) // NaN or inf - return ans; // No point computing derivative - GetWeightsDeriv(nnet_params_deriv, &normalized_weights_deriv); - GetUnnormalizedWeightsDeriv(weights, normalized_weights_deriv, - &weights_deriv); - weights_deriv.AddVec(1.0, weights_sum_to_one_penalty_deriv); - GetParamsDeriv(weights, weights_deriv, params_deriv); - return ans; -} - - -// enforces the constraint that the weights for each component must sum to one, -// if necessary. -void NnetChainCombiner::GetNormalizedWeights( - const VectorBase &unnorm_weights, - VectorBase *norm_weights) const { - if (!combine_config_.enforce_sum_to_one) { - norm_weights->CopyFromVec(unnorm_weights); - return; - } - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 c = 0; c < num_uc; c++) { - double sum = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - sum += unnorm_weights(index); - } - double inv_sum = 1.0 / sum; // if it's NaN then it's OK, we'll get NaN - // weights and eventually -inf objective. 
- for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*norm_weights)(index) = unnorm_weights(index) * inv_sum; - } - } -} - -void NnetChainCombiner::GetUnnormalizedWeightsDeriv( - const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv) { - if (!combine_config_.enforce_sum_to_one) { - unnorm_weights_deriv->CopyFromVec(norm_weights_deriv); - return; - } - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 c = 0; c < num_uc; c++) { - double sum = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - sum += unnorm_weights(index); - } - double inv_sum = 1.0 / sum; - double inv_sum_deriv = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - // in the forward direction, we'd do: - // (*norm_weights)(index) = unnorm_weights(index) * inv_sum; - (*unnorm_weights_deriv)(index) = inv_sum * norm_weights_deriv(index); - inv_sum_deriv += norm_weights_deriv(index) * unnorm_weights(index); - } - // note: d/dx (1/x) = -1/x^2 - double sum_deriv = -1.0 * inv_sum_deriv * inv_sum * inv_sum; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*unnorm_weights_deriv)(index) += sum_deriv; - } - } -} - - - - -} // namespace nnet3 -} // namespace kaldi diff --git a/src/nnet3/nnet-chain-combine.h b/src/nnet3/nnet-chain-combine.h deleted file mode 100644 index 3aeb3882650..00000000000 --- a/src/nnet3/nnet-chain-combine.h +++ /dev/null @@ -1,205 +0,0 @@ -// nnet3/nnet-chain-combine.h - -// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_NNET3_NNET_CHAIN_COMBINE_H_ -#define KALDI_NNET3_NNET_CHAIN_COMBINE_H_ - -#include "nnet3/nnet-utils.h" -#include "nnet3/nnet-compute.h" -#include "util/parse-options.h" -#include "itf/options-itf.h" -#include "nnet3/nnet-combine.h" -#include "nnet3/nnet-chain-diagnostics.h" - - -namespace kaldi { -namespace nnet3 { - -// we re-use NnetCombineConfig from nnet-combine.h - -/* - You should use this class as follows: - - Call the constructor, giving it the egs and the first nnet. - - Call AcceptNnet to provide all the other nnets. (the nnets will - be stored in a matrix in CPU memory, to avoid filing up GPU memory). - - Call Combine() - - Get the resultant nnet with GetNnet(). - */ -class NnetChainCombiner { - public: - /// Caution: this object retains a const reference to the "egs", so don't - /// delete them until it goes out of scope. - NnetChainCombiner(const NnetCombineConfig &nnet_config, - const chain::ChainTrainingOptions &chain_config, - int32 num_nnets, - const std::vector &egs, - const fst::StdVectorFst &den_fst, - const Nnet &first_nnet); - - /// You should call this function num_nnets-1 times after calling - /// the constructor, to provide the remaining nnets. 
- void AcceptNnet(const Nnet &nnet); - - void Combine(); - - const Nnet &GetNnet() const { return nnet_; } - - ~NnetChainCombiner() { delete prob_computer_; } - private: - NnetCombineConfig combine_config_; - const chain::ChainTrainingOptions &chain_config_; - - const std::vector &egs_; - - const fst::StdVectorFst &den_fst_; - - Nnet nnet_; // The current neural network. - - NnetChainComputeProb *prob_computer_; - - std::vector updatable_component_dims_; // dimension of each updatable - // component. - - int32 num_real_input_nnets_; // number of actual nnet inputs. - - int32 num_nnets_provided_; // keeps track of the number of calls to AcceptNnet(). - - // nnet_params_ are the parameters of the "effective input" - // neural nets; they will often be the same as the real inputs, - // but if num_real_input_nnets_ > config_.num_effective_nnets, they - // will be weighted combinations. - Matrix nnet_params_; - - // This vector has the same dimension as nnet_params_.NumRows(), - // and helps us normalize so each row of nnet_params corresponds to - // a weighted average of its inputs (will be all ones if - // config_.max_effective_inputs >= the number of nnets provided). - Vector tot_input_weighting_; - - // returns the parameter dimension, i.e. the dimension of the parameters that - // we are optimizing. This depends on the config, the number of updatable - // components and nnet_params_.NumRows(); it will never exceed the number of - // updatable components times nnet_params_.NumRows(). - int32 ParameterDim() const; - - int32 NumUpdatableComponents() const { - return updatable_component_dims_.size(); - } - // returns the weight dimension. - int32 WeightDim() const { - return nnet_params_.NumRows() * NumUpdatableComponents(); - } - - int32 NnetParameterDim() const { return nnet_params_.NumCols(); } - - // Computes the initial parameters. The parameters are the underlying thing - // that we optimize; their dimension equals ParameterDim(). 
They are not the same - // thing as the nnet parameters. - void GetInitialParameters(VectorBase *params) const; - - // Tests that derivatives are accurate. Prints warning and returns false if not. - bool SelfTestDerivatives(); - - // Tests that model derivatives are accurate. Just prints warning if not. - void SelfTestModelDerivatives(); - - - // prints the parameters via logging statements. - void PrintParams(const VectorBase ¶ms) const; - - // This function computes the objective function (and its derivative, if the objective - // function is finite) at the given value of the parameters (the parameters we're optimizing, - // i.e. the combination weights; not the nnet parameters. This function calls most of the - // functions below. - double ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv); - - - // Computes the weights from the parameters in a config-dependent way. The - // weight dimension is always (the number of updatable components times - // nnet_params_.NumRows()). - void GetWeights(const VectorBase ¶ms, - VectorBase *weights) const; - - // Given the raw weights: if config_.enforce_sum_to_one, then compute weights - // with sum-to-one constrint per component included; else just copy input to - // output. - void GetNormalizedWeights(const VectorBase &unnorm_weights, - VectorBase *norm_weights) const; - - // if config_.sum_to_one_penalty is 0.0, returns 0.0 and sets - // weights_penalty_deriv to 0.0; else it computes, for each - // updatable component u the total weight w_u, returns the value - // -0.5 * config_.sum_to_one_penalty * sum_u (w_u - 1.0)^2; - // and sets 'weights_penalty_deriv' to the derivative w.r.t. - // the result. - // Note: config_.sum_to_one_penalty is exclusive with - // config_.enforce_sum_to_one, so there is really no distinction between - // normalized and unnormalized weights here (since normalization would be a - // no-op). 
- double GetSumToOnePenalty(const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights = false) const; - - - // Computes the nnet-parameter vector from the normalized weights and - // nnet_params_, as a vector. (See the functions Vectorize() and - // UnVectorize() for how they relate to the nnet's components' parameters). - void GetNnetParameters(const Vector &normalized_weights, - VectorBase *nnet_params) const; - - // This function computes the objective function (and its derivative, if the objective - // function is finite) at the given value of nnet parameters. This involves the - // nnet computation. - double ComputeObjfAndDerivFromNnet(VectorBase &nnet_params, - VectorBase *nnet_params_deriv); - - // Given an objective-function derivative with respect to the nnet parameters, - // computes the derivative with respect to the (normalized) weights. - void GetWeightsDeriv(const VectorBase &nnet_params_deriv, - VectorBase *normalized_weights_deriv); - - - // Computes the derivative w.r.t. the unnormalized weights, by propagating - // through the normalization operation. - // If config_.enforce_sum_to_one == false, just copies norm_weights_deriv to - // unnorm_weights_deriv. - void GetUnnormalizedWeightsDeriv(const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv); - - - // Given a derivative w.r.t. the weights, outputs a derivative w.r.t. 
- // the params - void GetParamsDeriv(const VectorBase &weights, - const VectorBase &weight_deriv, - VectorBase *param_deriv); - - void ComputeUpdatableComponentDims(); - void FinishPreprocessingInput(); - -}; - - - -} // namespace nnet3 -} // namespace kaldi - -#endif diff --git a/src/nnet3/nnet-combine.cc b/src/nnet3/nnet-combine.cc deleted file mode 100644 index fa570ec96a3..00000000000 --- a/src/nnet3/nnet-combine.cc +++ /dev/null @@ -1,606 +0,0 @@ -// nnet3/nnet-combine.cc - -// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. 
- -#include "nnet3/nnet-combine.h" -#include "nnet3/nnet-utils.h" - -namespace kaldi { -namespace nnet3 { - -NnetCombiner::NnetCombiner(const NnetCombineConfig &config, - int32 num_nnets, - const std::vector &egs, - const Nnet &first_nnet): - config_(config), - egs_(egs), - nnet_(first_nnet), - num_real_input_nnets_(num_nnets), - nnet_params_(std::min(num_nnets, config_.max_effective_inputs), - NumParameters(first_nnet)), - tot_input_weighting_(nnet_params_.NumRows()) { - - if (config_.sum_to_one_penalty != 0.0 && - config_.enforce_sum_to_one) { - KALDI_WARN << "--sum-to-one-penalty=" << config_.sum_to_one_penalty - << " is nonzero, so setting --enforce-sum-to-one=false."; - config_.enforce_sum_to_one = false; - } - SubVector first_params(nnet_params_, 0); - VectorizeNnet(nnet_, &first_params); - tot_input_weighting_(0) += 1.0; - num_nnets_provided_ = 1; - ComputeUpdatableComponentDims(); - NnetComputeProbOptions compute_prob_opts; - compute_prob_opts.compute_deriv = true; - prob_computer_ = new NnetComputeProb(compute_prob_opts, nnet_); -} - -void NnetCombiner::ComputeUpdatableComponentDims(){ - updatable_component_dims_.clear(); - for (int32 c = 0; c < nnet_.NumComponents(); c++) { - Component *comp = nnet_.GetComponent(c); - if (comp->Properties() & kUpdatableComponent) { - // For now all updatable components inherit from class UpdatableComponent. - // If that changes in future, we will change this code. 
- UpdatableComponent *uc = dynamic_cast(comp); - if (uc == NULL) - KALDI_ERR << "Updatable component does not inherit from class " - "UpdatableComponent; change this code."; - updatable_component_dims_.push_back(uc->NumParameters()); - } - } -} - -void NnetCombiner::AcceptNnet(const Nnet &nnet) { - KALDI_ASSERT(num_nnets_provided_ < num_real_input_nnets_ && - "You called AcceptNnet too many times."); - int32 num_effective_nnets = nnet_params_.NumRows(); - if (num_effective_nnets == num_real_input_nnets_) { - SubVector this_params(nnet_params_, num_nnets_provided_); - VectorizeNnet(nnet, &this_params); - tot_input_weighting_(num_nnets_provided_) += 1.0; - } else { - // this_index is a kind of warped index, mapping the range - // 0 ... num_real_inputs_nnets_ - 1 onto the range - // 0 ... num_effective_nnets - 1. View the index as falling in - // between two integer indexes and determining weighting factors. - // we could view this as triangular bins. - BaseFloat this_index = num_nnets_provided_ * (num_effective_nnets - 1) - / static_cast(num_real_input_nnets_ - 1); - int32 lower_index = std::floor(this_index), - upper_index = lower_index + 1; - BaseFloat remaining_part = this_index - lower_index, - lower_weight = 1.0 - remaining_part, - upper_weight = remaining_part; - KALDI_ASSERT(lower_index >= 0 && upper_index <= num_effective_nnets && - lower_weight >= 0.0 && upper_weight >= 0.0 && - lower_weight <= 1.0 && upper_weight <= 1.0); - Vector vec(nnet_params_.NumCols(), kUndefined); - VectorizeNnet(nnet, &vec); - nnet_params_.Row(lower_index).AddVec(lower_weight, vec); - tot_input_weighting_(lower_index) += lower_weight; - if (upper_index == num_effective_nnets) { - KALDI_ASSERT(upper_weight < 0.1); - } else { - nnet_params_.Row(upper_index).AddVec(upper_weight, vec); - tot_input_weighting_(upper_index) += upper_weight; - } - } - num_nnets_provided_++; -} - -void NnetCombiner::FinishPreprocessingInput() { - KALDI_ASSERT(num_nnets_provided_ == num_real_input_nnets_ && - 
"You did not call AcceptInput() enough times."); - int32 num_effective_nnets = nnet_params_.NumRows(); - for (int32 i = 0; i < num_effective_nnets; i++) { - BaseFloat tot_weight = tot_input_weighting_(i); - KALDI_ASSERT(tot_weight > 0.0); // Or would be a coding error. - // Rescale so this row is like a weighted average instead of - // a weighted sum. - if (tot_weight != 1.0) - nnet_params_.Row(i).Scale(1.0 / tot_weight); - } -} - -void NnetCombiner::Combine() { - FinishPreprocessingInput(); - - if (!SelfTestDerivatives()) { - KALDI_LOG << "Self-testing model derivatives since parameter-derivatives " - "self-test failed."; - SelfTestModelDerivatives(); - } - - int32 dim = ParameterDim(); - LbfgsOptions lbfgs_options; - lbfgs_options.minimize = false; // We're maximizing. - lbfgs_options.m = dim; // Store the same number of vectors as the dimension - // itself, so this is BFGS. - lbfgs_options.first_step_impr = config_.initial_impr; - - Vector params(dim), deriv(dim); - double objf, initial_objf; - GetInitialParameters(¶ms); - - - OptimizeLbfgs lbfgs(params, lbfgs_options); - - for (int32 i = 0; i < config_.num_iters; i++) { - params.CopyFromVec(lbfgs.GetProposedValue()); - objf = ComputeObjfAndDerivFromParameters(params, &deriv); - KALDI_VLOG(2) << "Iteration " << i << " params = " << params - << ", objf = " << objf << ", deriv = " << deriv; - if (i == 0) initial_objf = objf; - lbfgs.DoStep(objf, deriv); - } - - if (!config_.sum_to_one_penalty) { - KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf; - } else { - Vector weights(WeightDim()); - GetWeights(params, &weights); - bool print_weights = true; - double penalty = GetSumToOnePenalty(weights, NULL, print_weights); - // note: initial_objf has no penalty term because it summed exactly - // to one. 
- KALDI_LOG << "Combining nnets, objective function changed from " - << initial_objf << " to " << objf << " = " - << (objf - penalty) << " + " << penalty; - } - - - // must recompute nnet_ if "params" is not exactly equal to the - // final params that LB - Vector final_params(dim); - final_params.CopyFromVec(lbfgs.GetValue(&objf)); - if (!params.ApproxEqual(final_params, 0.0)) { - // the following call makes sure that nnet_ corresponds to the parameters - // in "params". - ComputeObjfAndDerivFromParameters(final_params, &deriv); - } - PrintParams(final_params); - -} - -void NnetCombiner::PrintParams(const VectorBase ¶ms) const { - Vector weights(WeightDim()), normalized_weights(WeightDim()); - GetWeights(params, &weights); - GetNormalizedWeights(weights, &normalized_weights); - int32 num_models = nnet_params_.NumRows(), - num_uc = NumUpdatableComponents(); - - if (config_.separate_weights_per_component) { - std::vector updatable_component_names; - for (int32 c = 0; c < nnet_.NumComponents(); c++) { - const Component *comp = nnet_.GetComponent(c); - if (comp->Properties() & kUpdatableComponent) - updatable_component_names.push_back(nnet_.GetComponentName(c)); - } - KALDI_ASSERT(static_cast(updatable_component_names.size()) == - NumUpdatableComponents()); - for (int32 uc = 0; uc < num_uc; uc++) { - std::ostringstream os; - os.width(20); - os << std::left << updatable_component_names[uc] << ": "; - os.width(9); - os.precision(4); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + uc; - os << " " << std::left << normalized_weights(index); - } - KALDI_LOG << "Weights for " << os.str(); - } - } else { - int32 c = 0; // arbitrarily chosen; they'll all be the same. 
- std::ostringstream os; - os.width(9); - os.precision(4); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - os << " " << std::left << normalized_weights(index); - } - KALDI_LOG << "Model weights are: " << os.str(); - } - int32 num_effective_nnets = nnet_params_.NumRows(); - if (num_effective_nnets != num_real_input_nnets_) - KALDI_LOG << "Above, only " << num_effective_nnets << " weights were " - "printed due to the the --max-effective-inputs option; " - "there were " << num_real_input_nnets_ << " actual input nnets. " - "Each weight corresponds to a weighted average over a range of " - "nnets in the sequence (with triangular bins)"; -} - -bool NnetCombiner::SelfTestDerivatives() { - int32 num_tests = 2; // more properly, this is the number of dimensions in a - // single test. - double delta = 0.001; - int32 dim = ParameterDim(); - - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), - observed_changes(num_tests); - - GetInitialParameters(¶ms); - double initial_objf = ComputeObjfAndDerivFromParameters(params, - &deriv); - for (int32 i = 0; i < num_tests; i++) { - Vector new_deriv(dim), offset(dim), new_params(params); - offset.SetRandn(); - new_params.AddVec(delta, offset); - double new_objf = ComputeObjfAndDerivFromParameters(new_params, - &new_deriv); - // for predicted changes, interpolate old and new derivs. 
- predicted_changes(i) = - 0.5 * VecVec(new_params, deriv) - 0.5 * VecVec(params, deriv) + - 0.5 * VecVec(new_params, new_deriv) - 0.5 * VecVec(params, new_deriv); - observed_changes(i) = new_objf - initial_objf; - } - double threshold = 0.1; - KALDI_LOG << "predicted_changes = " << predicted_changes; - KALDI_LOG << "observed_changes = " << observed_changes; - if (!ApproxEqual(predicted_changes, observed_changes, threshold)) { - KALDI_WARN << "Derivatives self-test failed."; - return false; - } else { - return true; - } -} - - -void NnetCombiner::SelfTestModelDerivatives() { - int32 num_tests = 3; // more properly, this is the number of dimensions in a - // single test. - int32 dim = ParameterDim(); - - Vector params(dim), deriv(dim); - Vector predicted_changes(num_tests), - observed_changes(num_tests); - - GetInitialParameters(¶ms); - Vector weights(WeightDim()), normalized_weights(WeightDim()); - Vector nnet_params(NnetParameterDim(), kUndefined), - nnet_deriv(NnetParameterDim(), kUndefined); - GetWeights(params, &weights); - GetNormalizedWeights(weights, &normalized_weights); - GetNnetParameters(normalized_weights, &nnet_params); - - double initial_objf = ComputeObjfAndDerivFromNnet(nnet_params, - &nnet_deriv); - - double delta = 0.002 * std::sqrt(VecVec(nnet_params, nnet_params) / - NnetParameterDim()); - - - for (int32 i = 0; i < num_tests; i++) { - Vector new_nnet_deriv(NnetParameterDim()), - offset(NnetParameterDim()), new_nnet_params(nnet_params); - offset.SetRandn(); - new_nnet_params.AddVec(delta, offset); - double new_objf = ComputeObjfAndDerivFromNnet(new_nnet_params, - &new_nnet_deriv); - // for predicted changes, interpolate old and new derivs. 
- predicted_changes(i) = - 0.5 * VecVec(new_nnet_params, nnet_deriv) - - 0.5 * VecVec(nnet_params, nnet_deriv) + - 0.5 * VecVec(new_nnet_params, new_nnet_deriv) - - 0.5 * VecVec(nnet_params, new_nnet_deriv); - observed_changes(i) = new_objf - initial_objf; - } - double threshold = 0.1; - KALDI_LOG << "model-derivatives: predicted_changes = " << predicted_changes; - KALDI_LOG << "model-derivatives: observed_changes = " << observed_changes; - if (!ApproxEqual(predicted_changes, observed_changes, threshold)) - KALDI_WARN << "Model derivatives self-test failed."; -} - - - - -int32 NnetCombiner::ParameterDim() const { - if (config_.separate_weights_per_component) - return NumUpdatableComponents() * nnet_params_.NumRows(); - else - return nnet_params_.NumRows(); -} - - -void NnetCombiner::GetInitialParameters(VectorBase *params) const { - KALDI_ASSERT(params->Dim() == ParameterDim()); - params->Set(1.0 / nnet_params_.NumRows()); - if (config_.enforce_positive_weights) { - // we enforce positive weights by treating the params as the log of the - // actual weight. - params->ApplyLog(); - } -} - -void NnetCombiner::GetWeights(const VectorBase ¶ms, - VectorBase *weights) const { - KALDI_ASSERT(weights->Dim() == WeightDim()); - if (config_.separate_weights_per_component) { - weights->CopyFromVec(params); - } else { - int32 nc = NumUpdatableComponents(); - // have one parameter per row of nnet_params_, and need to repeat - // the weight for the different components. - for (int32 n = 0; n < nnet_params_.NumRows(); n++) { - for (int32 c = 0; c < nc; c++) - (*weights)(n * nc + c) = params(n); - } - } - // we enforce positive weights by having the weights be the exponential of the - // corresponding parameters. 
- if (config_.enforce_positive_weights) - weights->ApplyExp(); -} - - -void NnetCombiner::GetParamsDeriv(const VectorBase &weights, - const VectorBase &weights_deriv, - VectorBase *param_deriv) { - KALDI_ASSERT(weights.Dim() == WeightDim() && - param_deriv->Dim() == ParameterDim()); - Vector preexp_weights_deriv(weights_deriv); - if (config_.enforce_positive_weights) { - // to enforce positive weights we first compute weights (call these - // preexp_weights) and then take exponential. Note, d/dx exp(x) = exp(x). - // So the derivative w.r.t. the preexp_weights equals the derivative - // w.r.t. the weights, times the weights. - preexp_weights_deriv.MulElements(weights); - } - if (config_.separate_weights_per_component) { - param_deriv->CopyFromVec(preexp_weights_deriv); - } else { - int32 nc = NumUpdatableComponents(); - param_deriv->SetZero(); - for (int32 n = 0; n < nnet_params_.NumRows(); n++) - for (int32 c = 0; c < nc; c++) - (*param_deriv)(n) += preexp_weights_deriv(n * nc + c); - } -} - - -double NnetCombiner::GetSumToOnePenalty( - const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights) const { - - KALDI_ASSERT(config_.sum_to_one_penalty >= 0.0); - double penalty = config_.sum_to_one_penalty; - if (penalty == 0.0) { - weights_penalty_deriv->SetZero(); - return 0.0; - } - double ans = 0.0; - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - Vector tot_weights(num_uc); - std::ostringstream tot_weight_info; - for (int32 c = 0; c < num_uc; c++) { - double this_total_weight = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - double this_weight = weights(index); - this_total_weight += this_weight; - } - tot_weights(c) = this_total_weight; - // this_total_weight_deriv is the derivative of the penalty - // term w.r.t. this component's total weight. 
- double this_total_weight_deriv; - if (config_.enforce_positive_weights) { - // if config_.enforce_positive_weights is true, then we choose to - // formulate the penalty in a slightly different way.. this solves the - // problem that with the formulation in the 'else' below, if for some - // reason the total weight is << 1.0, the deriv w.r.t. the actual - // parameters gets tiny [because weight = exp(params)]. - double log_total = log(this_total_weight); - ans += -0.5 * penalty * log_total * log_total; - double log_total_deriv = -1.0 * penalty * log_total; - this_total_weight_deriv = log_total_deriv / this_total_weight; - } else { - ans += -0.5 * penalty * - (this_total_weight - 1.0) * (this_total_weight - 1.0); - this_total_weight_deriv = penalty * (1.0 - this_total_weight); - - } - if (weights_penalty_deriv != NULL) { - KALDI_ASSERT(weights.Dim() == weights_penalty_deriv->Dim()); - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*weights_penalty_deriv)(index) = this_total_weight_deriv; - } - } - } - if (print_weights) { - Vector tot_weights_float(tot_weights); - KALDI_LOG << "Total weights per component: " - << PrintVectorPerUpdatableComponent(nnet_, - tot_weights_float); - } - return ans; -} - - -void NnetCombiner::GetNnetParameters(const Vector &weights, - VectorBase *nnet_params) const { - KALDI_ASSERT(nnet_params->Dim() == nnet_params_.NumCols()); - nnet_params->SetZero(); - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 m = 0; m < num_models; m++) { - const SubVector src_params(nnet_params_, m); - int32 dim_offset = 0; - for (int32 c = 0; c < num_uc; c++) { - int32 index = m * num_uc + c; - BaseFloat weight = weights(index); - int32 dim = updatable_component_dims_[c]; - const SubVector src_component_params(src_params, dim_offset, - dim); - SubVector dest_component_params(*nnet_params, dim_offset, dim); - dest_component_params.AddVec(weight, src_component_params); - dim_offset += dim; 
- } - KALDI_ASSERT(dim_offset == nnet_params_.NumCols()); - } -} - -// compare GetNnetParameters. -void NnetCombiner::GetWeightsDeriv( - const VectorBase &nnet_params_deriv, - VectorBase *weights_deriv) { - KALDI_ASSERT(nnet_params_deriv.Dim() == nnet_params_.NumCols() && - weights_deriv->Dim() == WeightDim()); - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 m = 0; m < num_models; m++) { - const SubVector src_params(nnet_params_, m); - int32 dim_offset = 0; - for (int32 c = 0; c < num_uc; c++) { - int32 index = m * num_uc + c; - int32 dim = updatable_component_dims_[c]; - const SubVector src_component_params(src_params, dim_offset, - dim); - const SubVector component_params_deriv(nnet_params_deriv, - dim_offset, dim); - (*weights_deriv)(index) = VecVec(src_component_params, - component_params_deriv); - dim_offset += dim; - } - KALDI_ASSERT(dim_offset == nnet_params_.NumCols()); - } -} - -double NnetCombiner::ComputeObjfAndDerivFromNnet( - VectorBase &nnet_params, - VectorBase *nnet_params_deriv) { - BaseFloat sum = nnet_params.Sum(); - // inf/nan parameters->return -inf objective. - if (!(sum == sum && sum - sum == 0)) - return -std::numeric_limits::infinity(); - // Set nnet to have these params. - UnVectorizeNnet(nnet_params, &nnet_); - - prob_computer_->Reset(); - std::vector::const_iterator iter = egs_.begin(), - end = egs_.end(); - for (; iter != end; ++iter) - prob_computer_->Compute(*iter); - double tot_weights, - tot_objf = prob_computer_->GetTotalObjective(&tot_weights); - KALDI_ASSERT(tot_weights > 0.0); - const Nnet &deriv = prob_computer_->GetDeriv(); - VectorizeNnet(deriv, nnet_params_deriv); - // we prefer to deal with normalized objective functions. 
- nnet_params_deriv->Scale(1.0 / tot_weights); - return tot_objf / tot_weights; -} - - -double NnetCombiner::ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv) { - Vector weights(WeightDim()), normalized_weights(WeightDim()), - weights_sum_to_one_penalty_deriv(WeightDim()), - normalized_weights_deriv(WeightDim()), weights_deriv(WeightDim()); - Vector - nnet_params(NnetParameterDim(), kUndefined), - nnet_params_deriv(NnetParameterDim(), kUndefined); - GetWeights(params, &weights); - double ans = GetSumToOnePenalty(weights, &weights_sum_to_one_penalty_deriv); - GetNormalizedWeights(weights, &normalized_weights); - GetNnetParameters(normalized_weights, &nnet_params); - ans += ComputeObjfAndDerivFromNnet(nnet_params, &nnet_params_deriv); - if (ans != ans || ans - ans != 0) // NaN or inf - return ans; // No point computing derivative - GetWeightsDeriv(nnet_params_deriv, &normalized_weights_deriv); - GetUnnormalizedWeightsDeriv(weights, normalized_weights_deriv, - &weights_deriv); - weights_deriv.AddVec(1.0, weights_sum_to_one_penalty_deriv); - GetParamsDeriv(weights, weights_deriv, params_deriv); - return ans; -} - - -// enforces the constraint that the weights for each component must sum to one, -// if necessary. -void NnetCombiner::GetNormalizedWeights( - const VectorBase &unnorm_weights, - VectorBase *norm_weights) const { - if (!config_.enforce_sum_to_one) { - norm_weights->CopyFromVec(unnorm_weights); - return; - } - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 c = 0; c < num_uc; c++) { - double sum = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - sum += unnorm_weights(index); - } - double inv_sum = 1.0 / sum; // if it's NaN then it's OK, we'll get NaN - // weights and eventually -inf objective. 
- for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*norm_weights)(index) = unnorm_weights(index) * inv_sum; - } - } -} - -void NnetCombiner::GetUnnormalizedWeightsDeriv( - const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv) { - if (!config_.enforce_sum_to_one) { - unnorm_weights_deriv->CopyFromVec(norm_weights_deriv); - return; - } - int32 num_uc = NumUpdatableComponents(), - num_models = nnet_params_.NumRows(); - for (int32 c = 0; c < num_uc; c++) { - double sum = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - sum += unnorm_weights(index); - } - double inv_sum = 1.0 / sum; - double inv_sum_deriv = 0.0; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - // in the forward direction, we'd do: - // (*norm_weights)(index) = unnorm_weights(index) * inv_sum; - (*unnorm_weights_deriv)(index) = inv_sum * norm_weights_deriv(index); - inv_sum_deriv += norm_weights_deriv(index) * unnorm_weights(index); - } - // note: d/dx (1/x) = -1/x^2 - double sum_deriv = -1.0 * inv_sum_deriv * inv_sum * inv_sum; - for (int32 m = 0; m < num_models; m++) { - int32 index = m * num_uc + c; - (*unnorm_weights_deriv)(index) += sum_deriv; - } - } -} - - - - -} // namespace nnet3 -} // namespace kaldi diff --git a/src/nnet3/nnet-combine.h b/src/nnet3/nnet-combine.h deleted file mode 100644 index 5b60d30b8ed..00000000000 --- a/src/nnet3/nnet-combine.h +++ /dev/null @@ -1,251 +0,0 @@ -// nnet3/nnet-combine.h - -// Copyright 2012-2015 Johns Hopkins University (author: Daniel Povey) - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#ifndef KALDI_NNET3_NNET_COMBINE_H_ -#define KALDI_NNET3_NNET_COMBINE_H_ - -#include "nnet3/nnet-utils.h" -#include "nnet3/nnet-compute.h" -#include "util/parse-options.h" -#include "itf/options-itf.h" -#include "nnet3/nnet-diagnostics.h" - - -namespace kaldi { -namespace nnet3 { - -/** Configuration class that controls neural net combination, where we combine a - number of neural nets. -*/ -struct NnetCombineConfig { - int32 num_iters; // The dimension of the space we are optimizing in is fairly - // small (equal to the number of components times the number - // of neural nets we were given), so we optimize with BFGS - // (internally the code uses L-BFGS, but we set the the - // number of vectors to be the same as the dimension of the - // space, so it actually is regular BFGS. The num-iters - // corresponds to the number of function evaluations. 
- - - BaseFloat initial_impr; - int32 max_effective_inputs; - bool test_gradient; - bool enforce_positive_weights; - bool enforce_sum_to_one; - BaseFloat sum_to_one_penalty; - bool separate_weights_per_component; - NnetCombineConfig(): num_iters(60), - initial_impr(0.01), - max_effective_inputs(15), - test_gradient(false), - enforce_positive_weights(false), - enforce_sum_to_one(false), - sum_to_one_penalty(0.0), - separate_weights_per_component(true) { } - - void Register(OptionsItf *po) { - po->Register("num-iters", &num_iters, "Maximum number of function " - "evaluations for BFGS to use when optimizing combination weights"); - po->Register("max-effective-inputs", &max_effective_inputs, "Limits the number of " - "parameters that have to be learn to be equivalent to the number of " - "parameters we'd have to learn if the number of inputs nnets equalled " - "this number. Does this by using averages of nnets at close points " - "in the sequence of inputs, as the actual inputs to the computation."); - po->Register("initial-impr", &initial_impr, "Amount of objective-function change " - "we aim for on the first iteration (controls the initial step size)."); - po->Register("test-gradient", &test_gradient, "If true, activate code that " - "tests the gradient is accurate."); - po->Register("enforce-positive-weights", &enforce_positive_weights, - "If true, enforce that all weights are positive."); - po->Register("enforce-sum-to-one", &enforce_sum_to_one, "If true, enforce that " - "the model weights for each component should sum to one."); - po->Register("sum-to-one-penalty", &sum_to_one_penalty, "If >0, a penalty term " - "on the squared difference between sum(weights) for one component," - " and 1.0. This is like --enforce-sum-to-one, but done in a 'soft' " - "way (e.g. maybe useful with dropout). 
We suggest small values " - "like 10e-3 (for regular nnets) or 1.0e-04 (for chain models)."); - po->Register("separate-weights-per-component", &separate_weights_per_component, - "If true, have a separate weight for each updatable component in " - "the nnet."); - } -}; - - -/* - You should use this class as follows: - - Call the constructor, giving it the egs and the first nnet. - - Call AcceptNnet to provide all the other nnets. (the nnets will - be stored in a matrix in CPU memory, to avoid filing up GPU memory). - - Call Combine() - - Get the resultant nnet with GetNnet(). - */ -class NnetCombiner { - public: - /// Caution: this object retains a const reference to the "egs", so don't - /// delete them until it goes out of scope. - NnetCombiner(const NnetCombineConfig &config, - int32 num_nnets, - const std::vector &egs, - const Nnet &first_nnet); - /// You should call this function num_nnets-1 times after calling - /// the constructor, to provide the remaining nnets. - void AcceptNnet(const Nnet &nnet); - void Combine(); - const Nnet &GetNnet() const { return nnet_; } - - ~NnetCombiner() { delete prob_computer_; } - private: - NnetCombineConfig config_; - - const std::vector &egs_; - - Nnet nnet_; // The current neural network. - - NnetComputeProb *prob_computer_; - - std::vector updatable_component_dims_; // dimension of each updatable - // component. - - int32 num_real_input_nnets_; // number of actual nnet inputs. - - int32 num_nnets_provided_; // keeps track of the number of calls to AcceptNnet(). - - // nnet_params_ are the parameters of the "effective input" - // neural nets; they will often be the same as the real inputs, - // but if num_real_input_nnets_ > config_.num_effective_nnets, they - // will be weighted combinations. 
- Matrix nnet_params_; - - // This vector has the same dimension as nnet_params_.NumRows(), - // and helps us normalize so each row of nnet_params corresponds to - // a weighted average of its inputs (will be all ones if - // config_.max_effective_inputs >= the number of nnets provided). - Vector tot_input_weighting_; - - // returns the parameter dimension, i.e. the dimension of the parameters that - // we are optimizing. This depends on the config, the number of updatable - // components and nnet_params_.NumRows(); it will never exceed the number of - // updatable components times nnet_params_.NumRows(). - int32 ParameterDim() const; - - int32 NumUpdatableComponents() const { - return updatable_component_dims_.size(); - } - // returns the weight dimension. - int32 WeightDim() const { - return nnet_params_.NumRows() * NumUpdatableComponents(); - } - - int32 NnetParameterDim() const { return nnet_params_.NumCols(); } - - // Computes the initial parameters. The parameters are the underlying thing - // that we optimize; their dimension equals ParameterDim(). They are not the same - // thing as the nnet parameters. - void GetInitialParameters(VectorBase *params) const; - - // Tests that derivatives are accurate. Prints warning and returns false if not. - bool SelfTestDerivatives(); - - // Tests that model derivatives are accurate. Just prints warning if not. - void SelfTestModelDerivatives(); - - - // prints the parameters via logging statements. - void PrintParams(const VectorBase ¶ms) const; - - // This function computes the objective function (and its derivative, if the objective - // function is finite) at the given value of the parameters (the parameters we're optimizing, - // i.e. the combination weights; not the nnet parameters. This function calls most of the - // functions below. - double ComputeObjfAndDerivFromParameters( - VectorBase ¶ms, - VectorBase *params_deriv); - - - // Computes the weights from the parameters in a config-dependent way. 
The - // weight dimension is always (the number of updatable components times - // nnet_params_.NumRows()). - void GetWeights(const VectorBase ¶ms, - VectorBase *weights) const; - - // Given the raw weights: if config_.enforce_sum_to_one, then compute weights - // with sum-to-one constrint per component included; else just copy input to - // output. - void GetNormalizedWeights(const VectorBase &unnorm_weights, - VectorBase *norm_weights) const; - - // if config_.sum_to_one_penalty is 0.0, returns 0.0 and sets - // weights_penalty_deriv to 0.0; else it computes, for each - // updatable component u the total weight w_u, returns the value - // -0.5 * config_.sum_to_one_penalty * sum_u (w_u - 1.0)^2; - // and sets 'weights_penalty_deriv' to the derivative w.r.t. - // the result. - // Note: config_.sum_to_one_penalty is exclusive with - // config_.enforce_sum_to_one, so there is really no distinction between - // normalized and unnormalized weights here (since normalization would be a - // no-op). - double GetSumToOnePenalty(const VectorBase &weights, - VectorBase *weights_penalty_deriv, - bool print_weights = false) const; - - - // Computes the nnet-parameter vector from the normalized weights and - // nnet_params_, as a vector. (See the functions Vectorize() and - // UnVectorize() for how they relate to the nnet's components' parameters). - void GetNnetParameters(const Vector &normalized_weights, - VectorBase *nnet_params) const; - - // This function computes the objective function (and its derivative, if the objective - // function is finite) at the given value of nnet parameters. This involves the - // nnet computation. - double ComputeObjfAndDerivFromNnet(VectorBase &nnet_params, - VectorBase *nnet_params_deriv); - - // Given an objective-function derivative with respect to the nnet parameters, - // computes the derivative with respect to the (normalized) weights. 
- void GetWeightsDeriv(const VectorBase &nnet_params_deriv, - VectorBase *normalized_weights_deriv); - - - // Computes the derivative w.r.t. the unnormalized weights, by propagating - // through the normalization operation. - // If config_.enforce_sum_to_one == false, just copies norm_weights_deriv to - // unnorm_weights_deriv. - void GetUnnormalizedWeightsDeriv(const VectorBase &unnorm_weights, - const VectorBase &norm_weights_deriv, - VectorBase *unnorm_weights_deriv); - - - // Given a derivative w.r.t. the weights, outputs a derivative w.r.t. - // the params - void GetParamsDeriv(const VectorBase &weights, - const VectorBase &weight_deriv, - VectorBase *param_deriv); - - void ComputeUpdatableComponentDims(); - void FinishPreprocessingInput(); - -}; - - - -} // namespace nnet3 -} // namespace kaldi - -#endif diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index 5d67715a228..d2db4b4df67 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -28,42 +28,49 @@ namespace kaldi { namespace nnet3 { -double ComputeObjf(const std::vector &egs, +// Computes the objective of the moving average of nnet on egs. If either of +// batchnorm/dropout test modes is true, we make a copy of the moving average, +// set test modes on that and evaluate its objective. Note: the object that +// prob_computer->nnet_ refers to should be moving_average_nnet. +double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, + const std::vector &egs, + const Nnet &moving_average_nnet, NnetComputeProb *prob_computer) { - prob_computer->Reset(); - std::vector::const_iterator iter = egs.begin(), - end = egs.end(); - for (; iter != end; ++iter) - prob_computer->Compute(*iter); - double tot_weights, - tot_objf = prob_computer->GetTotalObjective(&tot_weights); - KALDI_ASSERT(tot_weights > 0.0); - // we prefer to deal with normalized objective functions. 
- return tot_objf / tot_weights; + if (batchnorm_test_mode || dropout_test_mode) { + Nnet moving_average_nnet_copy(moving_average_nnet); + if (batchnorm_test_mode) + SetBatchnormTestMode(true, &moving_average_nnet_copy); + if (dropout_test_mode) + SetDropoutTestMode(true, &moving_average_nnet_copy); + NnetComputeProbOptions compute_prob_opts; + NnetComputeProb prob_computer_test(compute_prob_opts, + moving_average_nnet_copy); + return ComputeObjf(false, false, egs, moving_average_nnet_copy, + &prob_computer_test); + } else { + prob_computer->Reset(); + std::vector::const_iterator iter = egs.begin(), + end = egs.end(); + for (; iter != end; ++iter) + prob_computer->Compute(*iter); + double tot_weights, + tot_objf = prob_computer->GetTotalObjective(&tot_weights); + KALDI_ASSERT(tot_weights > 0.0); + // inf/nan tot_objf->return -inf objective. + if (!(tot_objf == tot_objf && tot_objf - tot_objf == 0)) + return -std::numeric_limits::infinity(); + // we prefer to deal with normalized objective functions. + return tot_objf / tot_weights; + } } -// Note: the object that prob_computer.nnet_ refers to should be -// *moving_average_nnet. -double UpdateNnetMovingAverageAndComputeObjf(int32 num_models, - const std::vector &egs, - const Nnet &nnet, Nnet *moving_average_nnet, - NnetComputeProb *prob_computer) { - int32 num_params = NumParameters(nnet); - KALDI_ASSERT(num_params == NumParameters(*moving_average_nnet)); - Vector nnet_params(num_params, kUndefined), - moving_average_nnet_params(num_params, kUndefined); - VectorizeNnet(nnet, &nnet_params); - VectorizeNnet(*moving_average_nnet, &moving_average_nnet_params); - moving_average_nnet_params.Scale((num_models - 1.0) / num_models); - moving_average_nnet_params.AddVec(1.0 / num_models, nnet_params); - - BaseFloat sum = moving_average_nnet_params.Sum(); - // inf/nan parameters->return -inf objective. 
- if (!(sum == sum && sum - sum == 0)) - return -std::numeric_limits::infinity(); - - UnVectorizeNnet(moving_average_nnet_params, moving_average_nnet); - return ComputeObjf(egs, prob_computer); +// Updates moving average over num_models nnets, given the average over +// previous (num_models - 1) nnets, and the new nnet. +void UpdateNnetMovingAverage(int32 num_models, + const Nnet &nnet, Nnet *moving_average_nnet) { + KALDI_ASSERT(NumParameters(nnet) == NumParameters(*moving_average_nnet)); + ScaleNnet((num_models - 1.0) / num_models, moving_average_nnet); + AddNnet(nnet, 1.0 / num_models, moving_average_nnet); } } @@ -80,7 +87,7 @@ int main(int argc, char *argv[]) { "Using a subset of training or held-out examples, compute the average\n" "over the first n nnet3 models where we maxize the objective function\n" "for n. Note that the order of models has been reversed before\n" - "feeding into this binary. So we are actually combining last n models.\n" + "being fed into this binary. So we are actually combining last n models.\n" "Inputs and outputs are 'raw' nnets.\n" "\n" "Usage: nnet3-combine [options] ... \n" @@ -89,17 +96,24 @@ int main(int argc, char *argv[]) { " nnet3-combine 1.1.raw 1.2.raw 1.3.raw ark:valid.egs 2.raw\n"; bool binary_write = true; + int32 max_objective_evaluations = 30; bool batchnorm_test_mode = false, dropout_test_mode = true; std::string use_gpu = "yes"; ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); + po.Register("max-objective-evaluations", &max_objective_evaluations, "Max " + "number of objective evaluations in order to figure out the " + "best number of models to combine. It helps to speedup if " + "the number of models provided to this binary is quite large " + "(e.g. 
several hundred)."); po.Register("batchnorm-test-mode", &batchnorm_test_mode, - "If true, set test-mode to true on any BatchNormComponents."); + "If true, set test-mode to true on any BatchNormComponents " + "while evaluating objectives."); po.Register("dropout-test-mode", &dropout_test_mode, "If true, set test-mode to true on any DropoutComponents and " - "DropoutMaskComponents."); + "DropoutMaskComponents while evaluating objectives."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); @@ -123,13 +137,7 @@ int main(int argc, char *argv[]) { ReadKaldiObject(nnet_rxfilename, &nnet); Nnet moving_average_nnet(nnet), best_nnet(nnet); NnetComputeProbOptions compute_prob_opts; - NnetComputeProb *prob_computer = new NnetComputeProb(compute_prob_opts, - moving_average_nnet); - - if (batchnorm_test_mode) - SetBatchnormTestMode(true, &nnet); - if (dropout_test_mode) - SetDropoutTestMode(true, &nnet); + NnetComputeProb prob_computer(compute_prob_opts, moving_average_nnet); std::vector egs; egs.reserve(10000); // reserve a lot of space to minimize the chance of @@ -144,25 +152,35 @@ int main(int argc, char *argv[]) { KALDI_ASSERT(!egs.empty()); } - int32 best_n = 1; - double best_objf = ComputeObjf(egs, prob_computer); + // first evaluates the objective using the last model. + int32 best_num_to_combine = 1; + double best_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, &prob_computer); KALDI_LOG << "objective function using the last model is " << best_objf; - int32 num_inputs = po.NumArgs() - 2; - if (num_inputs > 1) { - for (int32 n = 1; n < num_inputs; n++) { + int32 num_nnets = po.NumArgs() - 2; + // then each time before we re-evaluate the objective function, we will add + // num_to_add models to the moving average. 
+ int32 num_to_add = (num_nnets + max_objective_evaluations - 1) / + max_objective_evaluations; + if (num_nnets > 1) { + for (int32 n = 1; n < num_nnets; n++) { ReadKaldiObject(po.GetArg(1 + n), &nnet); - double objf = UpdateNnetMovingAverageAndComputeObjf(n + 1, egs, nnet, - &moving_average_nnet, prob_computer); - KALDI_LOG << "Combining last " << n + 1 - << " models, objective function is " << objf; - if (objf > best_objf) { - best_objf = objf; - best_nnet = moving_average_nnet; - best_n = n + 1; + // updates the moving average + UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { + double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, &prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_num_to_combine = n + 1; + } } } - KALDI_LOG << "Using the model averaged over last " << best_n + KALDI_LOG << "Using the model averaged over last " << best_num_to_combine << " models, objective function is " << best_objf; #if HAVE_CUDA==1 From 88e2914be72fb0b49ecd96ea258cfe5f77b5fdaa Mon Sep 17 00:00:00 2001 From: freewym Date: Mon, 11 Dec 2017 21:07:51 -0500 Subject: [PATCH 011/184] python-level changes, added more documentations. 
--- .../nnet3/train/chain_objf/acoustic_model.py | 17 ++++------------- egs/wsj/s5/steps/libs/nnet3/train/common.py | 12 +++++++++++- .../libs/nnet3/train/frame_level_objf/common.py | 11 ++++------- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- egs/wsj/s5/steps/nnet3/train_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 2 +- src/chainbin/nnet3-chain-combine.cc | 12 +++++++----- src/nnet3bin/nnet3-combine.cc | 12 +++++++----- 10 files changed, 38 insertions(+), 36 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 02a3b4c75d5..5b640510ea1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -492,7 +492,7 @@ def compute_progress(dir, iter, run_opts): def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, xent_regularize, run_opts, - sum_to_one_penalty=0.0): + max_objective_evaluations=30): """ Function to do model combination In the nnet3 setup, the logic @@ -505,9 +505,6 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st models_to_combine.add(num_iters) - # TODO: if it turns out the sum-to-one-penalty code is not useful, - # remove support for it. 
- for iter in sorted(models_to_combine): model_file = '{0}/{1}.mdl'.format(dir, iter) if os.path.exists(model_file): @@ -528,12 +525,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st common_lib.execute_command( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-chain-combine --num-iters={opt_iters} \ + nnet3-chain-combine \ + --max-objective-evaluations={max_objective_evaluations} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ - --separate-weights-per-component={separate_weights} \ - --enforce-sum-to-one={hard_enforce} \ - --sum-to-one-penalty={penalty} \ - --enforce-positive-weights=true \ --verbose=3 {dir}/den.fst {raw_models} \ "ark,bg:nnet3-chain-copy-egs ark:{egs_dir}/combine.cegs ark:- | \ nnet3-chain-merge-egs --minibatch-size={num_chunk_per_mb} \ @@ -542,12 +536,9 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st {dir}/final.mdl""".format( command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, - opt_iters=(20 if sum_to_one_penalty <= 0 else 80), - separate_weights=(sum_to_one_penalty > 0), + max_objective_evaluations=max_objective_evaluations, l2=l2_regularize, leaky=leaky_hmm_coefficient, dir=dir, raw_models=" ".join(raw_model_strings), - hard_enforce=(sum_to_one_penalty <= 0), - penalty=sum_to_one_penalty, num_chunk_per_mb=num_chunk_per_minibatch_str, num_iters=num_iters, egs_dir=egs_dir)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index b3b443ceb4c..8168d2f94a6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -852,6 +852,16 @@ def __init__(self, the final model combination stage. 
These models will themselves be averages of iteration-number ranges""") + self.parser.add_argument("--trainer.optimization.max-objective-evaluations", + "--trainer.max-objective-evaluations", + type=int, dest='max_objective_evaluations', + default=30, + help="""The maximum number of objective + evaluations in order to figure out the + best number of models to combine. It helps to + speedup if the number of models provided to the + model combination binary is quite large (e.g. + several hundred).""") self.parser.add_argument("--trainer.optimization.do-final-combination", dest='do_final_combination', type=str, action=common_lib.StrToBoolAction, @@ -863,7 +873,7 @@ def __init__(self, type=float, dest='combine_sum_to_one_penalty', default=0.0, help="""If > 0, activates 'soft' enforcement of the sum-to-one penalty in combination (may be helpful - if using dropout). E.g. 1.0e-03.""") + if using dropout). E.g. 1.0e-03. It is deprecated.""") self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.0, help="""Momentum used in update computation. 
diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index f8a69c5ad84..46eec2e3b87 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -452,7 +452,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, minibatch_size_str, run_opts, chunk_width=None, get_raw_nnet_from_am=True, - sum_to_one_penalty=0.0, + max_objective_evaluations=30, use_multitask_egs=False, compute_per_dim_accuracy=False): """ Function to do model combination @@ -501,10 +501,8 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, use_multitask_egs=use_multitask_egs) common_lib.execute_command( """{command} {combine_queue_opt} {dir}/log/combine.log \ - nnet3-combine --num-iters=80 \ - --enforce-sum-to-one={hard_enforce} \ - --sum-to-one-penalty={penalty} \ - --enforce-positive-weights=true \ + nnet3-combine \ + --max-objective-evaluations={max_objective_evaluations} \ --verbose=3 {raw_models} \ "ark,bg:nnet3-copy-egs {multitask_egs_opts} \ {egs_rspecifier} ark:- | \ @@ -513,9 +511,8 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, """.format(command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, dir=dir, raw_models=" ".join(raw_model_strings), + max_objective_evaluations=max_objective_evaluations, egs_rspecifier=egs_rspecifier, - hard_enforce=(sum_to_one_penalty <= 0), - penalty=sum_to_one_penalty, mbsize=minibatch_size_str, out_model=out_model, multitask_egs_opts=multitask_egs_opts)) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index d23c379e104..b62f5510e3c 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -554,7 +554,7 @@ def train(args, run_opts): l2_regularize=args.l2_regularize, xent_regularize=args.xent_regularize, run_opts=run_opts, - 
sum_to_one_penalty=args.combine_sum_to_one_penalty) + max_objective_evaluations=args.max_objective_evaluations) else: logger.info("Copying the last-numbered model to final.mdl") common_lib.force_symlink("{0}.mdl".format(num_iters), diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 87a1fd5afed..073ad3e7d7a 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -364,7 +364,7 @@ def train(args, run_opts): models_to_combine=models_to_combine, egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, - sum_to_one_penalty=args.combine_sum_to_one_penalty) + max_objective_evaluations=args.max_objective_evaluations) if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 38396f0b4e7..2d092ceebc7 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -398,7 +398,7 @@ def train(args, run_opts): models_to_combine=models_to_combine, egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, get_raw_nnet_from_am=False, - sum_to_one_penalty=args.combine_sum_to_one_penalty, + max_objective_evaluations=args.max_objective_evaluations, use_multitask_egs=use_multitask_egs) else: common_lib.force_symlink("{0}.raw".format(num_iters), diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index c9ffcf7ff2c..b51632e7d2c 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -475,7 +475,7 @@ def train(args, run_opts): run_opts=run_opts, chunk_width=args.chunk_width, get_raw_nnet_from_am=False, compute_per_dim_accuracy=args.compute_per_dim_accuracy, - sum_to_one_penalty=args.combine_sum_to_one_penalty) + max_objective_evaluations=args.max_objective_evaluations) else: 
common_lib.force_symlink("{0}.raw".format(num_iters), "{0}/final.raw".format(args.dir)) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index e6f81b03c3b..005e751cae0 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -451,7 +451,7 @@ def train(args, run_opts): run_opts=run_opts, minibatch_size_str=args.num_chunk_per_minibatch, chunk_width=args.chunk_width, - sum_to_one_penalty=args.combine_sum_to_one_penalty, + max_objective_evaluations=args.max_objective_evaluations, compute_per_dim_accuracy=args.compute_per_dim_accuracy) if args.stage <= num_iters + 1: diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 520575e1d88..80cf72e2da3 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -111,11 +111,11 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("max-objective-evaluations", &max_objective_evaluations, "Max " - "number of objective evaluations in order to figure out the " - "best number of models to combine. It helps to speedup if " - "the number of models provided to this binary is quite large " - "(e.g. several hundred)."); + po.Register("max-objective-evaluations", &max_objective_evaluations, "The " + "maximum number of objective evaluations in order to figure " + "out the best number of models to combine. It helps to speedup " + "if the number of models provided to this binary is quite " + "large (e.g. 
several hundred)."); po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("batchnorm-test-mode", &batchnorm_test_mode, @@ -184,6 +184,8 @@ int main(int argc, char *argv[]) { ReadKaldiObject(this_nnet_rxfilename, &nnet); // updates the moving average UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + // evaluates the objective everytime after adding num_to_add model or + // all the models to the moving average. if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, egs, moving_average_nnet, chain_config, den_fst, &prob_computer); diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index d2db4b4df67..a38eb3eeddd 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -103,11 +103,11 @@ int main(int argc, char *argv[]) { ParseOptions po(usage); po.Register("binary", &binary_write, "Write output in binary mode"); - po.Register("max-objective-evaluations", &max_objective_evaluations, "Max " - "number of objective evaluations in order to figure out the " - "best number of models to combine. It helps to speedup if " - "the number of models provided to this binary is quite large " - "(e.g. several hundred)."); + po.Register("max-objective-evaluations", &max_objective_evaluations, "The " + "maximum number of objective evaluations in order to figure " + "out the best number of models to combine. It helps to speedup " + "if the number of models provided to this binary is quite " + "large (e.g. 
several hundred)."); po.Register("batchnorm-test-mode", &batchnorm_test_mode, "If true, set test-mode to true on any BatchNormComponents " "while evaluating objectives."); @@ -168,6 +168,8 @@ int main(int argc, char *argv[]) { ReadKaldiObject(po.GetArg(1 + n), &nnet); // updates the moving average UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + // evaluates the objective everytime after adding num_to_add model or + // all the models to the moving average. if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, egs, moving_average_nnet, &prob_computer); From 49827d473ba8e84f2ae9e34c29b92de80ab18bbc Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 12 Dec 2017 00:25:03 -0500 Subject: [PATCH 012/184] fix Makefile --- src/nnet3/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 3236c52d60f..8ddba56b0e0 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -22,9 +22,9 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ nnet-example.o nnet-nnet.o nnet-compile-utils.o \ nnet-utils.o nnet-compute.o nnet-test-utils.o nnet-analyze.o \ nnet-example-utils.o nnet-training.o \ - nnet-diagnostics.o nnet-combine.o nnet-am-decodable-simple.o \ + nnet-diagnostics.o nnet-am-decodable-simple.o \ nnet-optimize-utils.o nnet-chain-example.o \ - nnet-chain-training.o nnet-chain-diagnostics.o nnet-chain-combine.o \ + nnet-chain-training.o nnet-chain-diagnostics.o \ discriminative-supervision.o nnet-discriminative-example.o \ nnet-discriminative-diagnostics.o \ discriminative-training.o nnet-discriminative-training.o \ From b2967572cc59089e15f26021f308feca991c7a3c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 12 Dec 2017 17:46:19 -0500 Subject: [PATCH 013/184] [src,scripts] Changing which model stats are accumulated on (will affect memory-norm). 
Other small changes to MemoryNormComponent, will rework most of this. Adding script changes for memory-norm. --- .../steps/libs/nnet3/xconfig/basic_layers.py | 7 ++++ egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 + src/nnet3/nnet-chain-training.cc | 10 ++++- src/nnet3/nnet-compute.cc | 38 +++++++++++++------ src/nnet3/nnet-compute.h | 32 ++++++++++++++-- src/nnet3/nnet-normalize-component.cc | 31 +++++++++++---- src/nnet3/nnet-normalize-component.h | 3 +- src/nnet3/nnet-simple-component.cc | 6 +++ src/nnet3/nnet-training.cc | 10 ++++- 9 files changed, 110 insertions(+), 28 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 05ae5bcdc18..483883fdee4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -802,6 +802,13 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim, target_rms)) + elif nonlinearity == 'memnorm': + line = ('component name={0}.{1}' + ' type=MemoryNormComponent dim={2}' + ' target-rms={3} include-indirect-derivative=false ' + ''.format(self.name, nonlinearity, output_dim, + target_rms)) + elif nonlinearity == 'so': line = ('component name={0}.{1}' ' type=ScaleAndOffsetComponent dim={2} max-change=0.5 ' diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index c1cad89824c..127ea816a39 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -25,6 +25,7 @@ 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, 'relu-dropout-layer': xlayers.XconfigBasicLayer, 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, + 'relu-memnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : xlayers.XconfigBasicLayer, 
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..de351e6c543 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -94,8 +94,11 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, const NnetComputation &computation) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. This is mainly important for memory-norm. NnetComputer computer(nnet_config.compute_config, computation, - *nnet_, delta_nnet_); + nnet_, delta_nnet_); // give the inputs to the computer object. computer.AcceptInputs(*nnet_, eg.inputs); computer.Run(); @@ -130,8 +133,11 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, const NnetComputation &computation, bool is_backstitch_step1) { const NnetTrainerOptions &nnet_config = opts_.nnet_config; + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. This is mainly important for memory-norm. NnetComputer computer(nnet_config.compute_config, computation, - *nnet_, delta_nnet_); + nnet_, delta_nnet_); // give the inputs to the computer object. 
computer.AcceptInputs(*nnet_, eg.inputs); computer.Run(); diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 87fa62c6112..31d60e1ce4a 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -30,22 +30,37 @@ NnetComputer::NnetComputer(const NnetComputeOptions &options, const Nnet &nnet, Nnet *nnet_to_update): options_(options), computation_(computation), nnet_(nnet), - program_counter_(0), nnet_to_update_(nnet_to_update) { - KALDI_ASSERT(computation.indexes_cuda.size() == computation.indexes.size() && - computation.indexes_ranges_cuda.size() == computation.indexes_ranges.size() && + program_counter_(0), nnet_to_store_stats_(nnet_to_update), + nnet_to_update_(nnet_to_update) { + Init(); +} + +NnetComputer::NnetComputer(const NnetComputeOptions &options, + const NnetComputation &computation, + Nnet *nnet, + Nnet *nnet_to_update): + options_(options), computation_(computation), nnet_(*nnet), + program_counter_(0), nnet_to_store_stats_(nnet), + nnet_to_update_(nnet_to_update) { + Init(); +} + +void NnetComputer::Init() { + KALDI_ASSERT(computation_.indexes_cuda.size() == computation_.indexes.size() && + computation_.indexes_ranges_cuda.size() == computation_.indexes_ranges.size() && "You must call NnetComputation::ComputeCudaIndexes() before " "executing the computation."); - matrices_.resize(computation.matrices.size()); + matrices_.resize(computation_.matrices.size()); debug_ = (options_.debug || GetVerboseLevel() >= 5); if (debug_) { ComputationVariables variables; - variables.Init(computation); - ComputeCommandAttributes(nnet, computation, variables, + variables.Init(computation_); + ComputeCommandAttributes(nnet_, computation_, variables, &command_attributes_); std::string preamble; - computation.GetCommandStrings(nnet, &preamble, &command_strings_); + computation_.GetCommandStrings(nnet_, &preamble, &command_strings_); KALDI_LOG << preamble; - computation.GetSubmatrixStrings(nnet, &submatrix_strings_); + 
computation_.GetSubmatrixStrings(nnet_, &submatrix_strings_); } } @@ -177,6 +192,7 @@ NnetComputer::NnetComputer(const NnetComputer &other): nnet_(other.nnet_), program_counter_(other.program_counter_), pending_commands_(other.pending_commands_), + nnet_to_store_stats_(other.nnet_to_store_stats_), nnet_to_update_(other.nnet_to_update_), debug_(other.debug_), command_attributes_(other.command_attributes_), @@ -226,14 +242,14 @@ void NnetComputer::ExecuteCommand() { CuSubMatrix output(GetSubMatrix(c.arg4)); void *memo = component->Propagate(indexes, input, &output); if (c.arg6) { // need to store stats. - KALDI_ASSERT(nnet_to_update_ != NULL); - Component *upd_component = nnet_to_update_->GetComponent(c.arg1); + KALDI_ASSERT(nnet_to_store_stats_ != NULL); + Component *stats_component = nnet_to_store_stats_->GetComponent(c.arg1); bool was_in_place = (c.arg3 == c.arg4); // if propagate was in-place, provide empty matrix and not 'input', as // input is no longer valid. const CuSubMatrix maybe_input( GetSubMatrix(was_in_place ? 0 : c.arg3)); - upd_component->StoreStats(maybe_input, output, memo); + stats_component->StoreStats(maybe_input, output, memo); } SaveMemo(c.arg5, *component, memo); break; diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index e16cbfbb393..869dd107bf6 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -62,15 +62,29 @@ class NnetComputer { /// model update or model-derivative computation. /// You must call computation.ComputeCudaIndexes() before calling /// this function. + /// + /// Caution: there is another constructor that takes a pointer for + /// 'nnet', be careful not to mix these up. NnetComputer(const NnetComputeOptions &options, const NnetComputation &computation, const Nnet &nnet, Nnet *nnet_to_update); - /// Copy constructor. 
May not be used if memos are involved (memos are only - /// possible if backprop will take place, and in these situations you won't - /// normally be wanting to use the copy constructor anyway; the copy - /// constructor is more useful for things like RNNLM lattice rescoring). + /// This version of the constructor accepts a pointer to 'nnet' instead + /// of a const reference. The difference is that this version will, + /// for storing statistics (the StoreStats() function of class Component), + /// use 'nnet' instead of 'nnet_to_update' (if specified). + NnetComputer(const NnetComputeOptions &options, + const NnetComputation &computation, + Nnet *nnet, + Nnet *nnet_to_update); + + + /// Copy constructor. May not be used if memos are stored with this object + /// (which is only a possibility if backprop will take place, and in these + /// situations you won't normally be wanting to use the copy constructor + /// anyway; the copy constructor is more useful for things like RNNLM lattice + /// rescoring). NnetComputer(const NnetComputer &other); /// e.g. AcceptInput ("input", &input_mat), or for derivatives w.r.t. the @@ -112,9 +126,12 @@ class NnetComputer { private: + void Init(); // called from constructors. + const NnetComputeOptions &options_; const NnetComputation &computation_; const Nnet &nnet_; + int32 program_counter_; // command index to execute next. // To deal with inputs and outputs that are not provided/taken by the user in // the same order as listed in the computation, pending_commands_ contains a @@ -122,6 +139,13 @@ class NnetComputer { // executed. std::vector pending_commands_; + // A pointer to the copy of the nnet which we'll be using for stats + // accumulation (the StoreStats() function). May be NULL or the same + // as nnet_ or nnet_to_update_. + Nnet *nnet_to_store_stats_; + // A pointer to the copy of the nnet which we'll be updating the parameters + // of (nnet_to_update in the backprop function). 
May be NULL and usually + // will not be the same as nnet_. Nnet *nnet_to_update_; bool debug_; // command_attributes_ is only used if debug_=true. diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 7eca8594748..596ab48b9e5 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -545,7 +545,6 @@ void BatchNormComponent::Backprop( // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2". in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0); // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i). - } else { KALDI_ASSERT(offset_.Dim() == block_dim_); // the next call does no work if they point to the same memory. @@ -867,18 +866,27 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, // From this point, we can assume that the num-cols of 'in' and 'out' // equals block_dim_. - Memo *ans = NULL; - if (!test_mode_) - ans = GetMemo(in); + Memo *memo = NULL; + if (!test_mode_) { + memo = GetMemo(in); + if (false) { // temporary. 
+ MemoryNormComponent *temp = new MemoryNormComponent(*this); + temp->StoreStats(in, *out, memo); + Memo *new_memo = temp->GetMemo(in); + delete memo; + memo = new_memo; + delete temp; + } + } - if (test_mode_ || stats_count_ > 0.0) { + if (test_mode_) { CuSubVector x_mean(data_, 0), scale(data_, 4); out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); } else { - CuSubVector x_sum(memo->data, 0), + CuSubVector x_mean(memo->data, 5), scale(memo->data, 2); - out->AddVecToRows(-1.0 / memo->num_frames, x_sum); + out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); } return memo; @@ -891,7 +899,7 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( Memo *memo = new Memo; int32 num_frames = in.NumRows(); memo->num_frames = num_frames; - memo->data.Resize(5, block_dim_); + memo->data.Resize(6, block_dim_); CuSubVector x_sum(memo->data, 0), x_sumsq(memo->data, 1); x_sum.AddRowSumMat(1.0, in, 0.0); @@ -905,6 +913,8 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( // just copy over the scale. x_deriv and scale_deriv remain zero. memo->data.Row(2).CopyFromVec(data_.Row(4)); } + // get 'x_mean' + memo->data.Row(5).CopyFromVec(data_.Row(0)); } else { // We should only reach this point on when processing the first // minibatch of each training job. @@ -926,6 +936,11 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( // At this point 'scale' is the variance plus epsilon. scale.ApplyPow(-0.5); // OK, now 'scale' is the actual scale: the inverse standard deviation. 
+ + // get 'x_mean' + CuSubVector x_mean(memo->data, 5); + x_mean.CopyFromVec(x_sum); + x_mean.Scale(1.0 / num_frames); } return memo; } diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 68506174eb7..bc80421700a 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -403,7 +403,7 @@ class MemoryNormComponent: public Component { // The number of frames (after any reshaping; so in general it will // be the original NumRows() of the matrix, times dim_ / block_dim_). int32 num_frames; - // 'data' is of dimension 5 by block_dim_. + // 'data' is of dimension 6 by block_dim_. // Row 0, which we'll call 'x_sum', is the sum of the rows of the // input data. // Row 1, which we'll call 'x_sumsq', is the sum of the rows of the @@ -423,6 +423,7 @@ // instead of the possibly-updated values that might exist when // Backprop() is called. It's actually not clear whether this is // necessary. + // Row 5 ('x_mean') is a copy of the data mean the data was normalized with. CuMatrix data; // This is set to true if we have the 'indirect' terms in the derivative, diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index e76f7cae2a7..4d29ba3a070 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -4895,6 +4895,12 @@ void CompositeComponent::Init(const std::vector &components, max_rows_process_ = max_rows_process; for (size_t i = 0; i < components_.size(); i++) { + if (components_[i]->Type() == "MemoryNormComponent") { + // This is out of concerns about the fact that the stats accumulation + // is done in the backprop, not in the forward propagation. + KALDI_ERR << "MemoryNormComponent cannot currently exist inside " + "CompositeComponent"; + } // make sure all constituent components are simple. 
KALDI_ASSERT(components_[i]->Properties() & kSimpleComponent); if (i > 0) { diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 30cd47b3eb2..c606db034fb 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -87,8 +87,11 @@ void NnetTrainer::Train(const NnetExample &eg) { void NnetTrainer::TrainInternal(const NnetExample &eg, const NnetComputation &computation) { + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. This is mainly important for memory-norm. NnetComputer computer(config_.compute_config, computation, - *nnet_, delta_nnet_); + nnet_, delta_nnet_); // give the inputs to the computer object. computer.AcceptInputs(*nnet_, eg.io); computer.Run(); @@ -121,8 +124,11 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, const NnetComputation &computation, bool is_backstitch_step1) { + // note: because we give the 1st arg (nnet_) as a pointer to the + // constructor of 'computer', it will use that copy of the nnet to + // store stats. This is mainly important for memory-norm. NnetComputer computer(config_.compute_config, computation, - *nnet_, delta_nnet_); + nnet_, delta_nnet_); // give the inputs to the computer object. 
computer.AcceptInputs(*nnet_, eg.io); computer.Run(); From 8a839ef5340cc1e79dab038987778912a7a793e8 Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 12 Dec 2017 20:26:34 -0500 Subject: [PATCH 014/184] fix --- egs/wsj/s5/steps/info/chain_dir_info.pl | 3 + egs/wsj/s5/steps/info/nnet3_dir_info.pl | 3 + egs/wsj/s5/steps/libs/nnet3/train/common.py | 3 +- src/chainbin/nnet3-chain-combine.cc | 13 +++-- src/nnet3bin/nnet3-combine.cc | 63 ++++++++++----------- 5 files changed, 46 insertions(+), 39 deletions(-) diff --git a/egs/wsj/s5/steps/info/chain_dir_info.pl b/egs/wsj/s5/steps/info/chain_dir_info.pl index b0adb7e498c..d0fac5292c6 100755 --- a/egs/wsj/s5/steps/info/chain_dir_info.pl +++ b/egs/wsj/s5/steps/info/chain_dir_info.pl @@ -137,6 +137,9 @@ sub get_combine_info { if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) { close(F); return sprintf(" combine=%.3f->%.3f", $1, $2); + } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) { + close(F); + return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1); } } } diff --git a/egs/wsj/s5/steps/info/nnet3_dir_info.pl b/egs/wsj/s5/steps/info/nnet3_dir_info.pl index 06d07a63755..4b0e774a592 100755 --- a/egs/wsj/s5/steps/info/nnet3_dir_info.pl +++ b/egs/wsj/s5/steps/info/nnet3_dir_info.pl @@ -137,6 +137,9 @@ sub get_combine_info { if (m/Combining nnets, objective function changed from (\S+) to (\S+)/) { close(F); return sprintf(" combine=%.2f->%.2f", $1, $2); + } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) { + close(F); + return sprintf(" combine=%.2f->%.2f (over %d)", $2, $3, $1); } } } diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 8168d2f94a6..7312cc09fae 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -873,7 +873,8 @@ def __init__(self, type=float, dest='combine_sum_to_one_penalty', default=0.0, help="""If > 0, 
activates 'soft' enforcement of the sum-to-one penalty in combination (may be helpful - if using dropout). E.g. 1.0e-03. It is deprecated.""") + if using dropout). E.g. 1.0e-03. This option is + deprecated and does nothing.""") self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.0, help="""Momentum used in update computation. diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 80cf72e2da3..ac1b40d29db 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -170,9 +170,11 @@ int main(int argc, char *argv[]) { // first evaluates the objective using the last model. int32 best_num_to_combine = 1; - double best_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, - egs, moving_average_nnet, chain_config, den_fst, &prob_computer); - KALDI_LOG << "objective function using the last model is " << best_objf; + double + init_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, chain_config, den_fst, &prob_computer), + best_objf = init_objf; + KALDI_LOG << "objective function using the last model is " << init_objf; int32 num_nnets = po.NumArgs() - 3; // then each time before we re-evaluate the objective function, we will add @@ -198,8 +200,9 @@ int main(int argc, char *argv[]) { } } } - KALDI_LOG << "Using the model averaged over last " << best_num_to_combine - << " models, objective function is " << best_objf; + KALDI_LOG << "Combining " << best_num_to_combine + << " nnets, objective function changed from " << init_objf + << " to " << best_objf; if (HasBatchnorm(nnet)) RecomputeStats(egs, chain_config, den_fst, &best_nnet); diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index a38eb3eeddd..220dd663e30 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -90,7 +90,7 @@ int main(int argc, char *argv[]) { "being fed into this binary. 
So we are actually combining last n models.\n" "Inputs and outputs are 'raw' nnets.\n" "\n" - "Usage: nnet3-combine [options] ... \n" + "Usage: nnet3-combine [options] ... \n" "\n" "e.g.:\n" " nnet3-combine 1.1.raw 1.2.raw 1.3.raw ark:valid.egs 2.raw\n"; @@ -154,50 +154,47 @@ int main(int argc, char *argv[]) { // first evaluates the objective using the last model. int32 best_num_to_combine = 1; - double best_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, - egs, moving_average_nnet, &prob_computer); - KALDI_LOG << "objective function using the last model is " << best_objf; + double + init_objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, &prob_computer), + best_objf = init_objf; + KALDI_LOG << "objective function using the last model is " << init_objf; int32 num_nnets = po.NumArgs() - 2; // then each time before we re-evaluate the objective function, we will add // num_to_add models to the moving average. int32 num_to_add = (num_nnets + max_objective_evaluations - 1) / max_objective_evaluations; - if (num_nnets > 1) { - for (int32 n = 1; n < num_nnets; n++) { - ReadKaldiObject(po.GetArg(1 + n), &nnet); - // updates the moving average - UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); - // evaluates the objective everytime after adding num_to_add model or - // all the models to the moving average. 
- if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { - double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, - egs, moving_average_nnet, &prob_computer); - KALDI_LOG << "Combining last " << n + 1 - << " models, objective function is " << objf; - if (objf > best_objf) { - best_objf = objf; - best_nnet = moving_average_nnet; - best_num_to_combine = n + 1; - } + for (int32 n = 1; n < num_nnets; n++) { + ReadKaldiObject(po.GetArg(1 + n), &nnet); + // updates the moving average + UpdateNnetMovingAverage(n + 1, nnet, &moving_average_nnet); + // evaluates the objective every time after adding num_to_add models or + // all the models to the moving average. + if ((n - 1) % num_to_add == num_to_add - 1 || n == num_nnets - 1) { + double objf = ComputeObjf(batchnorm_test_mode, dropout_test_mode, + egs, moving_average_nnet, &prob_computer); + KALDI_LOG << "Combining last " << n + 1 + << " models, objective function is " << objf; + if (objf > best_objf) { + best_objf = objf; + best_nnet = moving_average_nnet; + best_num_to_combine = n + 1; } } - KALDI_LOG << "Using the model averaged over last " << best_num_to_combine - << " models, objective function is " << best_objf; + } + KALDI_LOG << "Combining " << best_num_to_combine + << " nnets, objective function changed from " << init_objf + << " to " << best_objf; + + if (HasBatchnorm(nnet)) + RecomputeStats(egs, &best_nnet); #if HAVE_CUDA==1 CuDevice::Instantiate().PrintProfile(); #endif - if (HasBatchnorm(nnet)) - RecomputeStats(egs, &best_nnet); - WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); - } else { - KALDI_LOG << "Copying the single input model directly to the output, " - << "without any combination."; - if (HasBatchnorm(nnet)) - RecomputeStats(egs, &nnet); - WriteKaldiObject(nnet, nnet_wxfilename, binary_write); - } + + WriteKaldiObject(best_nnet, nnet_wxfilename, binary_write); KALDI_LOG << "Finished combining neural nets, wrote model to " << nnet_wxfilename; } catch(const 
std::exception &e) { From 4e8a53ae5e21ddd7d7a9a51bf936ac80170e32c1 Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 12 Dec 2017 21:54:31 -0500 Subject: [PATCH 015/184] docs fixes --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 +---- src/chainbin/nnet3-chain-combine.cc | 21 +++++++++---------- src/nnet3bin/nnet3-combine.cc | 23 +++++++++------------ 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 7312cc09fae..2b4fdd92cec 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -871,10 +871,7 @@ def __init__(self, last-numbered model as the final.mdl).""") self.parser.add_argument("--trainer.optimization.combine-sum-to-one-penalty", type=float, dest='combine_sum_to_one_penalty', default=0.0, - help="""If > 0, activates 'soft' enforcement of the - sum-to-one penalty in combination (may be helpful - if using dropout). E.g. 1.0e-03. This option is - deprecated and does nothing.""") + help="""This option is deprecated and does nothing.""") self.parser.add_argument("--trainer.optimization.momentum", type=float, dest='momentum', default=0.0, help="""Momentum used in update computation. diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index ac1b40d29db..7cc341de028 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -28,26 +28,25 @@ namespace kaldi { namespace nnet3 { -// Computes the objective of the moving average of nnet on egs. If either of -// batchnorm/dropout test modes is true, we make a copy of the moving average, -// set test modes on that and evaluate its objective. Note: the object that -// prob_computer->nnet_ refers to should be moving_average_nnet. +// Computes the objective function for the examples in 'egs' given the model in +// 'nnet'. 
If either of batchnorm/dropout test modes is true, we make a copy of +// 'nnet', set test modes on that and evaluate its objective. +// Note: the object that prob_computer->nnet_ refers to should be 'nnet'. double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, - const std::vector &egs, - const Nnet &moving_average_nnet, + const std::vector &egs, const Nnet &nnet, const chain::ChainTrainingOptions &chain_config, const fst::StdVectorFst &den_fst, NnetChainComputeProb *prob_computer) { if (batchnorm_test_mode || dropout_test_mode) { - Nnet moving_average_nnet_copy(moving_average_nnet); + Nnet nnet_copy(nnet); if (batchnorm_test_mode) - SetBatchnormTestMode(true, &moving_average_nnet_copy); + SetBatchnormTestMode(true, &nnet_copy); if (dropout_test_mode) - SetDropoutTestMode(true, &moving_average_nnet_copy); + SetDropoutTestMode(true, &nnet_copy); NnetComputeProbOptions compute_prob_opts; NnetChainComputeProb prob_computer_test(compute_prob_opts, chain_config, - den_fst, moving_average_nnet_copy); - return ComputeObjf(false, false, egs, moving_average_nnet_copy, + den_fst, nnet_copy); + return ComputeObjf(false, false, egs, nnet_copy, chain_config, den_fst, &prob_computer_test); } else { prob_computer->Reset(); diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index 220dd663e30..a2eb61d7905 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -28,25 +28,22 @@ namespace kaldi { namespace nnet3 { -// Computes the objective of the moving average of nnet on egs. If either of -// batchnorm/dropout test modes is true, we make a copy of the moving average, -// set test modes on that and evaluate its objective. Note: the object that -// prob_computer->nnet_ refers to should be moving_average_nnet. +// Computes the objective function for the examples in 'egs' given the model in +// 'nnet'. 
If either of batchnorm/dropout test modes is true, we make a copy of +// 'nnet', set test modes on that and evaluate its objective. +// Note: the object that prob_computer->nnet_ refers to should be 'nnet'. double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, - const std::vector &egs, - const Nnet &moving_average_nnet, + const std::vector &egs, const Nnet &nnet, NnetComputeProb *prob_computer) { if (batchnorm_test_mode || dropout_test_mode) { - Nnet moving_average_nnet_copy(moving_average_nnet); + Nnet nnet_copy(nnet); if (batchnorm_test_mode) - SetBatchnormTestMode(true, &moving_average_nnet_copy); + SetBatchnormTestMode(true, &nnet_copy); if (dropout_test_mode) - SetDropoutTestMode(true, &moving_average_nnet_copy); + SetDropoutTestMode(true, &nnet_copy); NnetComputeProbOptions compute_prob_opts; - NnetComputeProb prob_computer_test(compute_prob_opts, - moving_average_nnet_copy); - return ComputeObjf(false, false, egs, moving_average_nnet_copy, - &prob_computer_test); + NnetComputeProb prob_computer_test(compute_prob_opts, nnet_copy); + return ComputeObjf(false, false, egs, nnet_copy, &prob_computer_test); } else { prob_computer->Reset(); std::vector::const_iterator iter = egs.begin(), From 1fa9ae85c4bb7270f44c105d6dcff5f9e7fcbf6b Mon Sep 17 00:00:00 2001 From: freewym Date: Tue, 12 Dec 2017 21:57:40 -0500 Subject: [PATCH 016/184] fix --- src/chainbin/nnet3-chain-combine.cc | 6 +++--- src/nnet3bin/nnet3-combine.cc | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/chainbin/nnet3-chain-combine.cc b/src/chainbin/nnet3-chain-combine.cc index 7cc341de028..ca0428553c1 100644 --- a/src/chainbin/nnet3-chain-combine.cc +++ b/src/chainbin/nnet3-chain-combine.cc @@ -28,9 +28,9 @@ namespace kaldi { namespace nnet3 { -// Computes the objective function for the examples in 'egs' given the model in -// 'nnet'. 
If either of batchnorm/dropout test modes is true, we make a copy of -// 'nnet', set test modes on that and evaluate its objective. +// Computes and returns the objective function for the examples in 'egs' given +// the model in 'nnet'. If either of batchnorm/dropout test modes is true, we +// make a copy of 'nnet', set test modes on that and evaluate its objective. // Note: the object that prob_computer->nnet_ refers to should be 'nnet'. double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, const std::vector &egs, const Nnet &nnet, diff --git a/src/nnet3bin/nnet3-combine.cc b/src/nnet3bin/nnet3-combine.cc index a2eb61d7905..4bcf4cdfb6d 100644 --- a/src/nnet3bin/nnet3-combine.cc +++ b/src/nnet3bin/nnet3-combine.cc @@ -28,9 +28,9 @@ namespace kaldi { namespace nnet3 { -// Computes the objective function for the examples in 'egs' given the model in -// 'nnet'. If either of batchnorm/dropout test modes is true, we make a copy of -// 'nnet', set test modes on that and evaluate its objective. +// Computes and returns the objective function for the examples in 'egs' given +// the model in 'nnet'. If either of batchnorm/dropout test modes is true, we +// make a copy of 'nnet', set test modes on that and evaluate its objective. // Note: the object that prob_computer->nnet_ refers to should be 'nnet'. 
double ComputeObjf(bool batchnorm_test_mode, bool dropout_test_mode, const std::vector &egs, const Nnet &nnet, From 126614ba43c6d2c483e7626d109fbb0d58bb44fd Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 12 Dec 2017 23:00:20 -0500 Subject: [PATCH 017/184] [src] Changes to how MemoryNormComponent behaves (use more up-to-date stats) --- src/nnet3/nnet-normalize-component.cc | 136 +++++++++++--------------- src/nnet3/nnet-normalize-component.h | 54 +++++----- 2 files changed, 82 insertions(+), 108 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 596ab48b9e5..3e72ca1a152 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -869,14 +869,6 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, Memo *memo = NULL; if (!test_mode_) { memo = GetMemo(in); - if (false) { // temporary. - MemoryNormComponent *temp = new MemoryNormComponent(*this); - temp->StoreStats(in, *out, memo); - Memo *new_memo = temp->GetMemo(in); - delete memo; - memo = new_memo; - delete temp; - } } if (test_mode_) { @@ -884,8 +876,8 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); } else { - CuSubVector x_mean(memo->data, 5), - scale(memo->data, 2); + CuSubVector x_mean(memo->data, 0), + scale(memo->data, 4); out->AddVecToRows(-1.0, x_mean); out->MulColsVec(scale); } @@ -895,52 +887,41 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( const CuMatrixBase &in) const { - KALDI_ASSERT(in.NumCols() == block_dim_ && !test_mode_); + KALDI_ASSERT(in.NumCols() == block_dim_ && !test_mode_ && + stats_count_ >= 0.0); Memo *memo = new Memo; - int32 num_frames = in.NumRows(); - memo->num_frames = num_frames; - memo->data.Resize(6, block_dim_); - CuSubVector x_sum(memo->data, 0), - 
x_sumsq(memo->data, 1); - x_sum.AddRowSumMat(1.0, in, 0.0); - x_sumsq.AddDiagMat2(1.0, in, kTrans, 0.0); - if (stats_count_ > 0.0) { - memo->has_indirect_terms = include_indirect_derivative_; - if (include_indirect_derivative_) { - // copy over scale, x_deriv and scale_deriv. - memo->data.RowRange(2, 3).CopyFromMat(data_.RowRange(4, 3)); - } else { - // just copy over the scale. x_deriv and scale_deriv remain zero. - memo->data.Row(2).CopyFromVec(data_.Row(4)); - } - // get 'x_mean' - memo->data.Row(5).CopyFromVec(data_.Row(0)); - } else { - // We should only reach this point on when processing the first - // minibatch of each training job. - - // note: 'x_deriv' and 'scale_deriv' will be zero. This means we're - // ignoring the smaller, indirect term in the derivative for the first - // minibatch of each training job. That indirect term is really not that - // important that we should worry much about this. - memo->has_indirect_terms = false; - - CuSubVector scale(memo->data, 2); - scale.CopyFromVec(x_sumsq); - scale.AddVecVec(-1.0 / (num_frames * 1.0 * num_frames), - x_sum, x_sum, 1.0 / num_frames); - // At this point 'scale' is the variance. - // We apply the floor at 0.0 as a failsafe for problems caused by roundoff. - scale.ApplyFloor(0.0); - scale.Add(epsilon_); - // At this point 'scale' is the variance plus epsilon. - scale.ApplyPow(-0.5); - // OK, now 'scale' is the actual scale: the inverse standard deviation. + BaseFloat old_stats_count = stats_count_, + num_frames = in.NumRows(), + new_stats_count = num_frames + old_stats_count, + old_weight = old_stats_count / new_stats_count; - // get 'x_mean' - CuSubVector x_mean(memo->data, 5); - x_mean.CopyFromVec(x_sum); - x_mean.Scale(1.0 / num_frames); + // The information in 'memo' will be copied to *this when + // StoreStats() is caled (we can't update it in the Propagate() + // function for 'const' reasons). 
+ memo->stats_count = new_stats_count; + memo->backward_count = backward_count_; + memo->data = data_; + + CuSubVector x_mean(memo->data, 0), + x_uvar(memo->data, 1), scale(memo->data, 4); + // Each row of 'in' gets a weight of 1.0 / new_stats_count in the stats. + x_mean.AddRowSumMat(1.0 / new_stats_count, in, old_weight); + x_uvar.AddDiagMat2(1.0 / new_stats_count, in, kTrans, old_weight); + + scale.CopyFromVec(x_uvar); + scale.AddVecVec(-1.0, x_mean, x_mean, 1.0); + // at this point, 'scale' is the variance. + scale.ApplyFloor(0.0); + scale.Add(epsilon_); + scale.ApplyPow(-0.5); + // OK, now 'scale' is the scale. + + if (backward_count_ != 0.0) { + // we have stats 'y_deriv' and 'y_deriv_y' and we need to update the + // quantities x_deriv = y_deriv * scale, and scale_deriv = y_deriv_y * + // scale. + memo->data.RowRange(5, 2).AddMatDiagVec( + 1.0, memo->data.RowRange(2, 2), kNoTrans, scale, 0.0); } return memo; } @@ -993,6 +974,7 @@ void MemoryNormComponent::Backprop( // have the backprop called if the in_deriv is non-NULL. if (test_mode_) { + // In test mode we treat it as a fixed scale and offset. KALDI_ASSERT(memo_in == NULL && stats_count_ != 0.0); // the following is a no-op if in_deriv and out_deriv are the same matrix. in_deriv->CopyFromMat(out_deriv); @@ -1041,11 +1023,12 @@ void MemoryNormComponent::Backprop( in_deriv->CopyFromMat(out_deriv); Memo *memo = static_cast(memo_in); - CuSubVector scale(memo->data, 2); + CuSubVector scale(memo->data, 4); in_deriv->MulColsVec(scale); - if (memo->has_indirect_terms) { - CuSubVector x_deriv(memo->data, 3), - scale_deriv(memo->data, 4); + + if (memo->backward_count != 0.0) { + CuSubVector x_deriv(memo->data, 5), + scale_deriv(memo->data, 6); in_deriv->AddVecToRows(-1.0, x_deriv); in_deriv->AddMatDiagVec(-1.0, out_value, kNoTrans, scale_deriv); } @@ -1090,26 +1073,23 @@ void MemoryNormComponent::StoreStats( // required statistics are already stored in 'memo_in'. 
Memo *memo = static_cast(memo_in); - BaseFloat num_frames = memo->num_frames, - old_stats_count = stats_count_, - new_stats_count = num_frames + old_stats_count, - old_weight = old_stats_count / new_stats_count; - - // x_mean_and_x_uvar is the first 2 rows of data_. - CuSubMatrix x_mean_and_x_uvar(data_, 0, 2, 0, block_dim_); - // x_sum_and_x_sumsq is the first 2 rows of data_. - CuSubMatrix x_sum_and_x_sumsq(memo->data, 0, 2, 0, block_dim_); - - x_mean_and_x_uvar.Scale(old_weight); - // The factor 1.0 / new_stats_count that appears below can be perhaps more - // clearly written as follows: first define - // new_weight = num_frames / new_stats_count - // and then write 'new_weight / num_frames', which simplifies to - // '1.0 / new_stats_count'. The factor of '1.0 / num_frames' - // is necessary to convert from data sums to a per-frame average. - x_mean_and_x_uvar.AddMat(1.0 / new_stats_count, x_sum_and_x_sumsq); - stats_count_ = new_stats_count; - ComputeDerived(); + // check that the memo's stats count is more than our stats_count_, + // which it should be because the memo should have added extra stats, + // and StoreStats() should be called directly after the Propagate() + // function. + // This could possibly fail with memo_in->stats_count == stats_count_ + // due to roundoff, if you trained with batchnorm-stats-scale set at 1, + // but that would be a poor choice of parameters anyway as + // roundoff would be a big problem. + KALDI_ASSERT(memo->stats_count > stats_count_); + + stats_count_ = memo->stats_count; + // Copying the entire data matrix should be safe because + // StoreStats() is always called directly after the corresponding + // Propagate(), and on the same object; and there should be + // no possibility that other things in this->data changed in + // the interim. 
+ data_.CopyFromMat(memo->data); } void MemoryNormComponent::Read(std::istream &is, bool binary) { diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index bc80421700a..7d6cafa098b 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -400,36 +400,30 @@ class MemoryNormComponent: public Component { private: struct Memo { - // The number of frames (after any reshaping; so in general it will - // be the original NumRows() of the matrix, times dim_ / block_dim_). - int32 num_frames; - // 'data' is of dimension 6 by block_dim_. - // Row 0, which we'll call 'x_sum', is the sum of the rows of the - // input data. - // Row 1, which we'll call 'x_sumsq', is the sum of the rows of the - // elementwise square of the input data matrix. - // Row 2,3,4 are 'scale', 'x_deriv', 'scale_deriv', which - // are just copies of the corresponding values in - // MemoryNormComponent::data_ (from the const nnet, the one we're - // training), and which will have been copied from there when this - // object was created. However if stats_count_ was <= 0 when this - // object was created (first minibatch), then 'scale' - // will be set to the mean and inverse-stddev implied by the stats - // 'sum' and 'sumsq', and 'x_deriv' and 'scale_deriv' will be zero. - // This is so that it does something sensible on the very first - // minibatch we train. The reason why we copy these quantities here - // is because in the backprop phase we feel it would be better to - // use the same values that were used in the forward propagation, - // instead of the possibly-updated values that might exist when - // Backprop() is called. It's actually not clear whether this is - // necessary. - // Row 5 ('x_mean') is a copy of the data mean the data wasnormalized with. 
+ // 'stats_count' is the same as stats_count_ in the MemoryNormComponent + // from whose Propagate() function this memo was generated, plus + // the number of frames we're propagating (this is after any reshaping + // if block_dim_ != dim_). + BaseFloat stats_count; + + // 'stats_count' is the same as stats_count_ in the MemoryNormComponent + // from whose Propagate() function this memo was generated. It's mainly + // included because the backprop code wants to see if this was nonzero. + BaseFloat backward_count; + + // The structure of 'data' is the same as the data_ member of + // MemoryNormComponent; it's a matrix of dimension 7 by block_dim_. + // It will differ from the data_ member of the component we generated this + // from by the addition of some extra data in the 'x_sum' and 'x_sumsq' + // stats, and a corresponding modification of the 'scale', 'x_deriv' + // and 'scale_deriv' quantities. + // + // (note: the reason we update the stats before propagation rather + // than after, is for stability: otherwise, with relu units, if we only + // update the stats after the propagation we get a particular pathology: if + // a unit was previously always zero it will get a big scale; and if then we + // start getting some nonzero output, the scale on it will be too large.) CuMatrix data; - - // This is set to true if we have the 'indirect' terms in the derivative, - // relating to the 'x_deriv' and 'scale_deriv' terms in 'data'. If false, - // we save some computation. - bool has_indirect_terms; }; @@ -503,7 +497,7 @@ class MemoryNormComponent: public Component { // We store data_ as a single matrix because it enables certain operations // to be done using fewer kernels, but it contains various different quantities, // which we'll describe below as if they were separate variables. - // data_ is of dimension 6 by block_dim_. + // data_ is of dimension 7 by block_dim_. 
CuMatrix data_; // data_.Row(0) is 'x_mean', which is the decaying moving-average of // input data x; or zero if stats_count_ is zero. From 81335d085f3e9c9b87aa71483e0ed3140446eaa3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 13 Dec 2017 23:08:44 -0500 Subject: [PATCH 018/184] [src] Simplify NormalizeLayer. --- src/nnet3/nnet-normalize-component.cc | 97 +++++++++++++-------------- src/nnet3/nnet-normalize-component.h | 15 ++--- 2 files changed, 52 insertions(+), 60 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 3e72ca1a152..f305c196504 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -714,7 +714,7 @@ void BatchNormComponent::ZeroStats() { We can compute: mean = sum / count var = epsilon + (sumsq / count) - (mean * mean) - scale = var^{-0.5} + scale = target_rms * var^{-0.5} y(i) = (x(i) - mean) * scale. @@ -729,8 +729,11 @@ void BatchNormComponent::ZeroStats() { mean' = -scale * \sum_i w(i) y'(i) scale' = \sum_i w(i) y'(i) (x(i) - mean) = 1/scale \sum_i w(i) y'(i) y(i) - var' = -0.5 var^{-1.5} scale' - = -0.5 var^{-1} \sum_i w(i) y'(i) y(i) + var' = -0.5 target_rms var^{-1.5} scale' + = -0.5 target_rms var^{-1.5} (1/scale) \sum_i w(i) y'(i) y(i) + .. and using 1/scale = var^{0.5}/target_rms, + = -0.5 var^{-1} \sum_i w(i) y'(i) y(i) (*) + It will be convenient to write down 'per-frame' versions of all of these quantities, which are divided by the total count: @@ -752,16 +755,23 @@ void BatchNormComponent::ZeroStats() { x'(i) = y'(i)*scale + mean_norm' + 2 var_norm' (x(i) - mean) = y'(i)*scale + mean_norm' + 2 var_norm' y(i) / scale - = y'(i)*scale + mean_norm' - y(i) * scale/count * \sum_i w(i) y'(i) y(i) - - I'm afraid I just pulled the above out of thin air... needs some more - derivation. The part about (x(i) - mean) can be obtained, I believe, - from computation of the derivative of the variance w.r.t. the x(i) values. - + ... 
and substituting in the equation (*) above for var', using var_norm' = var'/scale, + and rearranging slightly: + = y'(i)*scale + mean_norm' - y(i) * var^{-1}/scale * 1/count * \sum_i w(i) y'(i) y(i) + .. and using scale=target-rms * var^{-0.5}, so var^{-1}/scale = var^{-0.5}/target-rms = scale/target-rms^2: + = y'(i)*scale + mean_norm' - y(i) * scale/(count*target-rms^2) * \sum_i w(i) y'(i) y(i) + .. and considering that the factor of 'scale' appears (directly or indirectly) in all 3 + of the terms in the above expression, we can reorganize this as: + = scale * (y'(i) - 1/count*\sum_i w(i)*y(i) - 1/(count*target-rms^2) * \sum_i w(i) y'(i) y(i)) */ void MemoryNormComponent::SetTestMode(bool test_mode) { + if (test_mode && stats_count_ <= 0) { + KALDI_WARN << "Refusing to set test-mode in MemoryNormComponent since no " + "stats are present."; + return; + } test_mode_ = test_mode; } @@ -795,16 +805,13 @@ std::string MemoryNormComponent::Info() const { if (stats_count_ > 0.0) { CuSubVector x_mean(data_, 0), y_deriv(data_, 2), y_deriv_y(data_, 3), - scale(data_, 4), x_deriv(data_, 5), - scale_deriv(data_, 6); + scale(data_, 4); if (stats_count_ > 0.0) stream << ", x-mean=" << SummarizeVector(x_mean) << ", scale=" << SummarizeVector(scale); if (backward_count_ > 0.0) stream << ", y-deriv=" << SummarizeVector(y_deriv) - << ", y-deriv-y=" << SummarizeVector(y_deriv_y) - << ", x-deriv=" << SummarizeVector(x_deriv) - << ", scale-deriv=" << SummarizeVector(scale_deriv); + << ", y-deriv-y=" << SummarizeVector(y_deriv_y); } return stream.str(); } @@ -836,7 +843,7 @@ void MemoryNormComponent::InitFromConfig(ConfigLine *cfl) { << cfl->UnusedValues(); stats_count_ = 0.0; backward_count_ = 0.0; - data_.Resize(7, block_dim_); + data_.Resize(5, block_dim_); } @@ -862,7 +869,7 @@ void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, out->CopyFromMat(in); if (test_mode_ && stats_count_ <= 0.0) - KALDI_ERR << "Test mode set but no stats available."; + 
KALDI_ERR << "Test mode set but no stats available."; // From this point, we can assume that the num-cols of 'in' and 'out' // equals block_dim_. @@ -909,20 +916,15 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( x_uvar.AddDiagMat2(1.0 / new_stats_count, in, kTrans, old_weight); scale.CopyFromVec(x_uvar); - scale.AddVecVec(-1.0, x_mean, x_mean, 1.0); + // we save a CUDA operation by applying the scale 'target_rms_scale' before doing + // ApplyPow(-0.5), and this requires taking it to the power -2. + BaseFloat target_rms_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-target_rms_scale, x_mean, x_mean, target_rms_scale); // at this point, 'scale' is the variance. scale.ApplyFloor(0.0); - scale.Add(epsilon_); + scale.Add(epsilon_ * target_rms_scale); scale.ApplyPow(-0.5); // OK, now 'scale' is the scale. - - if (backward_count_ != 0.0) { - // we have stats 'y_deriv' and 'y_deriv_y' and we need to update the - // quantities x_deriv = y_deriv * scale, and scale_deriv = y_deriv_y * - // scale. - memo->data.RowRange(5, 2).AddMatDiagVec( - 1.0, memo->data.RowRange(2, 2), kNoTrans, scale, 0.0); - } return memo; } @@ -1013,8 +1015,8 @@ void MemoryNormComponent::Backprop( to_update->backward_count_ = new_backward_count; // We don't bother calling to_update->ComputeDerived()-- although it would // be harmless-- because in the current situations where this code is - // reached, to_update will be the delta_nnet_, and the derived parameters of - // delta_nnet_ aren't used. + // reached, to_update will be the delta_nnet_, and the derived parameter + // 'scale') of delta_nnet_ aren't used. 
// to_update->ComputeDerived(); } @@ -1023,42 +1025,37 @@ void MemoryNormComponent::Backprop( in_deriv->CopyFromMat(out_deriv); Memo *memo = static_cast(memo_in); + if (memo->backward_count != 0.0) { + CuSubVector y_deriv(memo->data, 2), + y_deriv_y(memo->data, 3); + in_deriv->AddVecToRows(-1.0, y_deriv); + in_deriv->AddMatDiagVec(-1.0 / (target_rms_ * target_rms_), + out_value, kNoTrans, y_deriv_y); + } CuSubVector scale(memo->data, 4); in_deriv->MulColsVec(scale); - if (memo->backward_count != 0.0) { - CuSubVector x_deriv(memo->data, 5), - scale_deriv(memo->data, 6); - in_deriv->AddVecToRows(-1.0, x_deriv); - in_deriv->AddMatDiagVec(-1.0, out_value, kNoTrans, scale_deriv); - } } void MemoryNormComponent::ComputeDerived() { - KALDI_ASSERT(stats_count_ >= 0.0 && data_.NumRows() == 7); + KALDI_ASSERT(stats_count_ >= 0.0 && data_.NumRows() == 5); if (stats_count_ == 0.0) { - // zero 'scale', 'x_deriv' and 'scale_deriv'. - data_.RowRange(4, 3).SetZero(); + // zero 'scale'. + data_.Row(4).SetZero(); return; } CuSubVector x_mean(data_, 0), x_uvar(data_, 1), - y_deriv(data_, 2), y_deriv_y(data_, 3), scale(data_, 4); + scale(data_, 4); scale.CopyFromVec(x_uvar); - scale.AddVecVec(-1.0, x_mean, x_mean, 1.0); - // at this point, 'scale' is the variance. + // we save a CUDA operation by applying the scale 'target_rms_scale' before doing + // ApplyPow(-0.5), and this requires taking it to the power -2. + BaseFloat target_rms_scale = 1.0 / (target_rms_ * target_rms_); + scale.AddVecVec(-target_rms_scale, x_mean, x_mean, target_rms_scale); + // at this point, 'scale' is the variance (divided by target_rms^2). scale.ApplyFloor(0.0); - scale.Add(epsilon_); + scale.Add(epsilon_ * target_rms_scale); scale.ApplyPow(-0.5); - if (backward_count_ == 0.0) { - // The following statement sets x_deriv and scale_deriv to zero. - data_.RowRange(5, 2).SetZero(); - } else { - // The following statement sets x_deriv = y_deriv * scale, - // and scale_deriv = y_deriv_y * scale. 
- data_.RowRange(5, 2).AddMatDiagVec(1.0, - data_.RowRange(2, 2), kNoTrans, scale, 0.0); - } } void MemoryNormComponent::StoreStats( diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 7d6cafa098b..e32ad549c28 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -164,6 +164,8 @@ class BatchNormComponent: public Component { // accumulate these stats; they are stored as a matter of course on each // iteration of training, as for NonlinearComponents, and we'll use the stats // from the most recent [script-level] iteration. + // (Note: it will refuse to actually set test-mode to true if there + // are no stats stored.) void SetTestMode(bool test_mode); // constructor using another component @@ -412,7 +414,7 @@ class MemoryNormComponent: public Component { BaseFloat backward_count; // The structure of 'data' is the same as the data_ member of - // MemoryNormComponent; it's a matrix of dimension 7 by block_dim_. + // MemoryNormComponent; it's a matrix of dimension 5 by block_dim_. // It will differ from the data_ member of the component we generated this // from by the addition of some extra data in the 'x_sum' and 'x_sumsq' // stats, and a corresponding modification of the 'scale', 'x_deriv' @@ -497,7 +499,7 @@ class MemoryNormComponent: public Component { // We store data_ as a single matrix because it enables certain operations // to be done using fewer kernels, but it contains various different quantities, // which we'll describe below as if they were separate variables. - // data_ is of dimension 7 by block_dim_. + // data_ is of dimension 5 by block_dim_. CuMatrix data_; // data_.Row(0) is 'x_mean', which is the decaying moving-average of // input data x; or zero if stats_count_ is zero. @@ -511,18 +513,11 @@ class MemoryNormComponent: public Component { // objective w.r.t. the output); or zero if backward_count_ // is zero. 
// - // The quantities below are derived from the stats above. + // The quantity below is derived from the stats above. // // data_.Row(4) is 'scale', which is the inverse square root of the // covariance computed from x_mean and x_uvar (plus epsilon), // or zero if stats_count_ is zero. - // data_.Row(5) is 'x_deriv', which is the negative of the average derivative - // (per frame) of the objective function w.r.t the input x (just the - // part that comes via the derivative w.r.t. the x mean). - // 'x_deriv' equals 'y_deriv' times 'scale'. - // data_.Row(6) is 'scale_deriv', which relates to the part of the - // derivative w.r.t. the input that comes from the objf - // derivative w.r.t. the scale. It equals scale * y_deriv_y. }; From aefb72eb8cd8664415d08dad7313d0d4170446ab Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 14 Dec 2017 22:12:38 -0500 Subject: [PATCH 019/184] [src] Simplify MemoryNormComponent and always store stats during backprop on nnet_to_store_stats_ --- src/nnet3/nnet-compute.cc | 20 +++++++++++++++----- src/nnet3/nnet-normalize-component.cc | 27 ++++++++++++++------------- src/nnet3/nnet-normalize-component.h | 4 ++-- src/nnet3/nnet-test-utils.cc | 10 ---------- 4 files changed, 31 insertions(+), 30 deletions(-) diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 31d60e1ce4a..12a4ec65ae9 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -261,11 +261,21 @@ void NnetComputer::ExecuteCommand() { debug_str << nnet_.GetComponentName(c.arg1); const Component *component = nnet_.GetComponent(c.arg1); KALDI_ASSERT(!(computation_.need_model_derivative && !nnet_to_update_)); - Component *upd_component = (nnet_to_update_ && - c.command_type == kBackprop && - computation_.need_model_derivative ? 
- nnet_to_update_->GetComponent(c.arg1) : - NULL); + Component *upd_component = NULL; + if (c.command_type == kBackprop) { // this block sets 'upd_component' + Nnet *nnet_to_update; + if (component->Properties()&kUpdatableComponent) { + nnet_to_update = (computation_.need_model_derivative ? + nnet_to_update_ : NULL); + } else { + // Some non-updatable components, such as CompositeComponent and + // MemoryNormComponent, store stats in the backprop. For other + // types of component, this arg won't matter. + nnet_to_update = nnet_to_store_stats_; + } + if (nnet_to_update) + upd_component = nnet_to_update->GetComponent(c.arg1); + } ComponentPrecomputedIndexes *indexes = computation_.component_precomputed_indexes[c.arg2].data; const CuSubMatrix in_value(GetSubMatrix(c.arg3)); diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index f305c196504..1e3314bf91f 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -903,7 +903,7 @@ MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( old_weight = old_stats_count / new_stats_count; // The information in 'memo' will be copied to *this when - // StoreStats() is caled (we can't update it in the Propagate() + // StoreStats() is called (we can't update it in the Propagate() // function for 'const' reasons). memo->stats_count = new_stats_count; memo->backward_count = backward_count_; @@ -1013,28 +1013,25 @@ void MemoryNormComponent::Backprop( y_deriv_y.AddDiagMatMat(1.0 / new_backward_count, out_deriv, kTrans, out_value, kNoTrans, old_weight); to_update->backward_count_ = new_backward_count; - // We don't bother calling to_update->ComputeDerived()-- although it would - // be harmless-- because in the current situations where this code is - // reached, to_update will be the delta_nnet_, and the derived parameter - // 'scale') of delta_nnet_ aren't used. 
- // to_update->ComputeDerived(); + // Now 'to_update' will typically be the same as 'this', so we need + // to compute the derived parameters because it affects some code that's + // below. + to_update->ComputeDerived(); } // the following does no work if in_deriv and out_deriv are the same matrix. in_deriv->CopyFromMat(out_deriv); - Memo *memo = static_cast(memo_in); - if (memo->backward_count != 0.0) { - CuSubVector y_deriv(memo->data, 2), - y_deriv_y(memo->data, 3); + if (this->backward_count_ != 0.0) { + CuSubVector y_deriv(data_, 2), + y_deriv_y(data_, 3); in_deriv->AddVecToRows(-1.0, y_deriv); in_deriv->AddMatDiagVec(-1.0 / (target_rms_ * target_rms_), out_value, kNoTrans, y_deriv_y); } - CuSubVector scale(memo->data, 4); + CuSubVector scale(data_, 4); in_deriv->MulColsVec(scale); - } @@ -1084,7 +1081,7 @@ void MemoryNormComponent::StoreStats( // Copying the entire data matrix should be safe because // StoreStats() is always called directly after the corresponding // Propagate(), and on the same object; and there should be - // no possibility that other things in this->data changed in + // no possibility that other things in this->data_ changed in // the interim. 
data_.CopyFromMat(memo->data); } @@ -1169,6 +1166,10 @@ void MemoryNormComponent::Add(BaseFloat alpha, const Component &other_in) { return; } + if (alpha * other->stats_count_ == 0.0 && + alpha * other->backward_count_ == 0.0) + return; + BaseFloat new_stats_count = stats_count_ + alpha * other->stats_count_, new_backward_count = backward_count_ + alpha * other->backward_count_; diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index e32ad549c28..5299862ee65 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -296,11 +296,11 @@ class BatchNormComponent: public Component { /* - MemoryNormComponent - MemoryNormComponent is like batch normalization, except the stats are accumulated as a weighted sum over past minibatches (if this is not the first minibatch), instead of over the current minibatch. + Caution: we don't test this component in the standard way because it + would fail the derivative tests. You can use it in the same way you would normally use BatchNormComponent. diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 6ed0b6f9191..83b902a9b90 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1711,16 +1711,6 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " learning-rate=" << learning_rate; break; } - /* case 35: { - *component_type = "MemoryNormComponent"; - int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2); - os << " dim=" << dim - << " block-dim=" << block_dim << " target-rms=" - << RandInt(1, 2) << " include-indirect-derivative=" - << (RandInt(0, 1) == 0 ? "true" : "false") - << " epsilon=" << (RandInt(0, 1) == 0 ? 
"0.1" : "1.0"); - break; - }*/ default: KALDI_ERR << "Error generating random component"; } From 8fae6ae7319003b942cfe7b316aa48d03c9eeb86 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Dec 2017 02:05:54 -0500 Subject: [PATCH 020/184] [scripts] Script fix; update batchnorm(/memnorm) stats faster in first part of training. --- .../steps/libs/nnet3/train/chain_objf/acoustic_model.py | 9 ++++++++- egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 5b640510ea1..78993bca217 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -173,13 +173,19 @@ def train_new_models(dir, iter, srand, num_jobs, (" --write-cache={0}/cache.{1}".format(dir, iter + 1) if job == 1 else "")) + # For the first epoch (at most the first 15 iters), scale the batchnorm stats + # down more aggressively. This affects memory-norm components. 
+ batchnorm_opt=("--batchnorm-stats-scale=0.5" + if num_archives_processed < (num_archives * frame_subsampling_factor) and iter < 15 + else "") + thread = common_lib.background_command( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ --apply-deriv-weights={app_deriv_wts} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ {cache_io_opts} --xent-regularize={xent_reg} \ - {deriv_time_opts} \ + {deriv_time_opts} {batchnorm_opt} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ --backstitch-training-scale={backstitch_training_scale} \ @@ -199,6 +205,7 @@ def train_new_models(dir, iter, srand, num_jobs, dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, job=job, deriv_time_opts=" ".join(deriv_time_opts), + batchnorm_opt=batchnorm_opt, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 483883fdee4..c8a71e15672 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -805,7 +805,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): elif nonlinearity == 'memnorm': line = ('component name={0}.{1}' ' type=MemoryNormComponent dim={2}' - ' target-rms={3} include-indirect-derivative=false ' + ' target-rms={3} ' ''.format(self.name, nonlinearity, output_dim, target_rms)) From 6148dccd9f5a9ac31cebbd5a9cf6c7bf6346861a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Dec 2017 17:42:19 -0500 Subject: [PATCH 021/184] [src,scripts] Add extra nnet3 diagnostics; add reothonormalize option for LinearComponent --- .../nnet3/train/chain_objf/acoustic_model.py | 26 ++++++- .../nnet3/train/frame_level_objf/common.py | 23 ++++++ egs/wsj/s5/steps/make_phone_graph.sh | 1 + 
egs/wsj/s5/utils/dict_dir_add_pronprobs.sh | 18 ++++- .../s5/utils/lang/make_phone_bigram_lang.sh | 4 +- src/nnet3/nnet-chain-training.cc | 19 +++++ src/nnet3/nnet-convolutional-component.cc | 38 +++++----- src/nnet3/nnet-parse.cc | 27 ++++++- src/nnet3/nnet-parse.h | 23 +++++- src/nnet3/nnet-simple-component.cc | 47 +++++++++--- src/nnet3/nnet-simple-component.h | 30 ++++++-- src/nnet3/nnet-training.cc | 19 +++++ src/nnet3/nnet-utils.cc | 73 +++++++++++++++++++ src/nnet3/nnet-utils.h | 12 ++- src/nnet3bin/nnet3-show-progress.cc | 13 +++- 15 files changed, 315 insertions(+), 58 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 02a3b4c75d5..9b424a5e384 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -480,14 +480,34 @@ def compute_progress(dir, iter, run_opts): common_lib.background_command( """{command} {dir}/log/progress.{iter}.log \ nnet3-am-info {model} '&&' \ - nnet3-show-progress --use-gpu=no \ - "nnet3-am-copy --raw=true {prev_model} - |" \ - "nnet3-am-copy --raw=true {model} - |" + nnet3-show-progress --use-gpu=no {prev_model} {model} """.format(command=run_opts.command, dir=dir, iter=iter, model=model, prev_model=prev_model)) + if iter % 10 == 0 and iter > 0: + # Every 10 iters, print some more detailed information. + # full_progress.X.log contains some diagnostics of the difference in + # parameters, printed in the same format as from nnet3-info. 
+ common_lib.background_command( + """{command} {dir}/log/full_progress.{iter}.log \ + nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model)) + # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 + # option which includes stats on the singular values of the parameter matrices. + common_lib.background_command( + """{command} {dir}/log/full_info.{iter}.log \ + nnet3-info --verbose=2 {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model)) + def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, egs_dir, leaky_hmm_coefficient, l2_regularize, diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index f8a69c5ad84..9c09394ccb4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -447,6 +447,29 @@ def compute_progress(dir, iter, egs_dir, ''.format(command=run_opts.command, dir=dir, iter=iter, model=model, prev_model=prev_model)) + if iter % 10 == 0 and iter > 0: + # Every 10 iters, print some more detailed information. + # full_progress.X.log contains some diagnostics of the difference in + # parameters, printed in the same format as from nnet3-info. + common_lib.background_command( + """{command} {dir}/log/full_progress.{iter}.log \ + nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model)) + # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 + # option which includes stats on the singular values of the parameter matrices. 
+ common_lib.background_command( + """{command} {dir}/log/full_info.{iter}.log \ + nnet3-info --verbose=2 {model} + """.format(command=run_opts.command, + dir=dir, + iter=iter, + model=model)) + + def combine_models(dir, num_iters, models_to_combine, egs_dir, minibatch_size_str, diff --git a/egs/wsj/s5/steps/make_phone_graph.sh b/egs/wsj/s5/steps/make_phone_graph.sh index 817f7d1f10b..aaf88cc66d2 100755 --- a/egs/wsj/s5/steps/make_phone_graph.sh +++ b/egs/wsj/s5/steps/make_phone_graph.sh @@ -8,6 +8,7 @@ # is to be used for segmentation, and uses that together with a model to # make a decoding graph. # Uses SRILM. +# See also utils/lang/make_phone_bigram_lm.sh. # Begin configuration section. stage=0 diff --git a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh index 50191cf90cb..59ae4a4c994 100755 --- a/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh +++ b/egs/wsj/s5/utils/dict_dir_add_pronprobs.sh @@ -6,6 +6,11 @@ # 2015 Hainan Xu +# The thing that this script implements is described in the paper: +# "PRONUNCIATION AND SILENCE PROBABILITY MODELING FOR ASR" +# by Guoguo Chen et al, see +# http://www.danielpovey.com/files/2015_interspeech_silprob.pdf + . ./path.sh || exit 1; # begin configuration @@ -73,7 +78,7 @@ fi # the cat and awk commands below are implementing add-one smoothing. cat <(awk '{print 1, $0;}' <$dir/lexicon.txt) $pron_counts | \ awk '{ count = $1; $1 = ""; word_count[$2] += count; pron_count[$0] += count; pron2word[$0] = $2; } - END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; + END{ for (p in pron_count) { word = pron2word[p]; num = pron_count[p]; den = word_count[word]; print num / den, p } } ' | \ awk '{ word = $2; $2 = $1; $1 = word; print; }' | grep -v '^' |\ sort -k1,1 -k2g,2 -k3 > $dir/lexiconp.txt @@ -108,6 +113,11 @@ fi # Create $dir/lexiconp_silprob.txt and $dir/silprob.txt if silence counts file # exists. 
The format of $dir/lexiconp_silprob.txt is: # word pron-prob P(s_r | w) F(s_l | w) F(n_l | w) pron +# where: P(s_r | w) is the probability of silence to the right of the word +# F(s_l | w) is a factor which is greater than one if silence to the +# left of the word is more than averagely probable. +# F(n_l | w) is a factor which is greater than one if nonsilence to the +# left of the word is more than averagely probable. if [ -n "$sil_counts" ]; then if [ ! -s "$sil_counts" ]; then echo "$0: expected file $sil_counts to exist and not empty" && exit 1; @@ -175,7 +185,7 @@ if [ -n "$sil_counts" ]; then # Computes F(s_l | w) and F(n_l | w) in the paper. $lambda3 = 2; # Smoothing term, \lambda_3 in the paper. foreach my $wpron (keys %all_wprons) { - @col = split(" ", $wpron); + @col = split(" ", $wpron); $word = shift @col; $pron = join(" ", @col); $pron_prob = $all_wprons{$wpron}; @@ -189,7 +199,7 @@ if [ -n "$sil_counts" ]; then print LPSP "$word $pron_prob $P_w_sr{$wpron} $F_sl_w $F_nl_w $pron\n"; } - + # Create silprob.txt $BOS_sil_count = $wpron_sil{""} + $sil_prob * $lambda2; $BOS_nonsil_count = $wpron_nonsil{""} + (1 - $sil_prob) * $lambda2; @@ -206,7 +216,7 @@ if [ -n "$sil_counts" ]; then fi # now regenerate lexicon.txt from lexiconp.txt, to make sure the lines are -# in the same order. +# in the same order. cat $dir/lexiconp.txt | awk '{$2 = ""; print;}' | sed 's/ / /g' >$dir/lexicon.txt diff --git a/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh b/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh index dcb77bb1342..1d3d04896b4 100755 --- a/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh +++ b/egs/wsj/s5/utils/lang/make_phone_bigram_lang.sh @@ -9,10 +9,10 @@ # is to limit the number of transitions, so we can decode reasonably fast, and the # graph won't blow up. This is probably going to be most useful for things like # language-id. +# +# See also steps/make_phone_graph.sh -# We might later have options here; if not, I'll emove this. 
- echo "$0 $@" # Print the command line for logging [ -f ./path.sh ] && . ./path.sh; # source the path. diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..b24e81e7494 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -119,6 +119,10 @@ void NnetChainTrainer::TrainInternal(const NnetChainExample &eg, // happens when we use the model with batchnorm test-mode set). ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + // The following will only do something if we have a LinearComponent + // with is-constrained-orthonormal set to true. + ConstrainOrthonormal(nnet_); + // Scale delta_nnet if (success) ScaleNnet(nnet_config.momentum, delta_nnet_); @@ -167,6 +171,21 @@ void NnetChainTrainer::TrainInternalBackstitch(const NnetChainExample &eg, nnet_config.max_param_change, max_change_scale, scale_adding, nnet_, &num_max_change_per_component_applied_, &num_max_change_global_applied_); + if (is_backstitch_step1) { + // The following will only do something if we have a LinearComponent + // with is-constrained-orthonormal set to true. We choose to do this + // only on the 1st backstitch step, for efficiency. + ConstrainOrthonormal(nnet_); + } + + if (!is_backstitch_step1) { + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). Do this + // after backstitch step 2 so that the stats are scaled down before we start + // the next minibatch. 
+ ScaleBatchnormStats(nnet_config.batchnorm_stats_scale, nnet_); + } + ScaleNnet(0.0, delta_nnet_); } diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index f689984e876..333d7a79cfa 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -263,18 +263,14 @@ void TimeHeightConvolutionComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("alpha-out", &alpha_out); cfl->GetValue("num-minibatches-history", &num_minibatches_history); - preconditioner_in_.SetAlpha(alpha_in); - preconditioner_out_.SetAlpha(alpha_out); int32 dim_in = linear_params_.NumCols() + 1, dim_out = linear_params_.NumRows(); - if (rank_in < 0) { + if (rank_in < 0) rank_in = std::min(80, (dim_in + 1) / 2); - preconditioner_in_.SetRank(rank_in); - } - if (rank_out < 0) { + preconditioner_in_.SetRank(rank_in); + if (rank_out < 0) rank_out = std::min(80, (dim_out + 1) / 2); - preconditioner_out_.SetRank(rank_out); - } + preconditioner_out_.SetRank(rank_out); preconditioner_in_.SetNumMinibatchesHistory(num_minibatches_history); preconditioner_out_.SetNumMinibatchesHistory(num_minibatches_history); @@ -360,29 +356,29 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( const CuMatrixBase &in_value, const CuMatrixBase &out_deriv) { - CuVector bias_temp(bias_params_.Dim()); + CuVector bias_deriv(bias_params_.Dim()); - { // this block computes 'bias_temp', the derivative w.r.t. the bias. + { // this block computes 'bias_deriv', the derivative w.r.t. the bias. 
KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && out_deriv.NumCols() == model_.height_out * model_.num_filters_out); CuSubMatrix out_deriv_reshaped( out_deriv.Data(), out_deriv.NumRows() * model_.height_out, model_.num_filters_out, model_.num_filters_out); - bias_temp.AddRowSumMat(1.0, out_deriv_reshaped); + bias_deriv.AddRowSumMat(1.0, out_deriv_reshaped); } - CuMatrix params_temp(linear_params_.NumRows(), + CuMatrix params_deriv(linear_params_.NumRows(), linear_params_.NumCols() + 1); - params_temp.CopyColFromVec(bias_temp, linear_params_.NumCols()); + params_deriv.CopyColFromVec(bias_deriv, linear_params_.NumCols()); - CuSubMatrix linear_params_temp( - params_temp, 0, linear_params_.NumRows(), + CuSubMatrix linear_params_deriv( + params_deriv, 0, linear_params_.NumRows(), 0, linear_params_.NumCols()); ConvolveBackwardParams(indexes.computation, in_value, out_deriv, - 1.0, &linear_params_temp); + 1.0, &linear_params_deriv); // the precondition-directions code outputs a scalar that // must be multiplied by its output (this saves one @@ -393,22 +389,22 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( // scalars are different across iterations, the scalars // will be pretty similar on different iterations BaseFloat scale1, scale2; - preconditioner_in_.PreconditionDirections(¶ms_temp, NULL, + preconditioner_in_.PreconditionDirections(¶ms_deriv, NULL, &scale1); - CuMatrix params_temp_transpose(params_temp, kTrans); - preconditioner_out_.PreconditionDirections(¶ms_temp_transpose, + CuMatrix params_deriv_transpose(params_deriv, kTrans); + preconditioner_out_.PreconditionDirections(¶ms_deriv_transpose, NULL, &scale2); linear_params_.AddMat( learning_rate_ * scale1 * scale2, - params_temp_transpose.RowRange(0, linear_params_.NumCols()), + params_deriv_transpose.RowRange(0, linear_params_.NumCols()), kTrans); bias_params_.AddVec(learning_rate_ * scale1 * scale2, - params_temp_transpose.Row(linear_params_.NumCols())); + 
params_deriv_transpose.Row(linear_params_.NumCols())); } diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 2c4da825013..37d44a89673 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -537,7 +537,10 @@ void PrintParameterStats(std::ostringstream &os, void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuMatrix ¶ms, - bool include_mean) { + bool include_mean, + bool include_row_norms, + bool include_column_norms, + bool include_singular_values) { os << std::setprecision(4); os << ", " << name << '-'; int32 dim = params.NumRows() * params.NumCols(); @@ -551,8 +554,26 @@ void PrintParameterStats(std::ostringstream &os, os << "rms=" << rms; } os << std::setprecision(6); // restore the default precision. - if (GetVerboseLevel() >= 2) { - // At verbose level >= 2, print stats of the singular values of the matrix. + + if (include_row_norms) { + CuVector row_norms(params.NumRows()); + row_norms.AddDiagMat2(1.0, params, kNoTrans, 0.0); + row_norms.ApplyPow(0.5); + Vector row_norms_cpu; + row_norms.Swap(&row_norms_cpu); + os << ", " << name << "-row-norms=" + << SummarizeVector(row_norms_cpu); + } + if (include_column_norms) { + CuVector col_norms(params.NumCols()); + col_norms.AddDiagMat2(1.0, params, kTrans, 0.0); + col_norms.ApplyPow(0.5); + Vector col_norms_cpu; + col_norms.Swap(&col_norms_cpu); + os << ", " << name << "-col-norms=" + << SummarizeVector(col_norms_cpu); + } + if (include_singular_values) { Matrix params_cpu(params); Vector s(std::min(params.NumRows(), params.NumCols())); params_cpu.Svd(&s); diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index fef21301ff6..83e36d37c0b 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -189,8 +189,11 @@ std::string ErrorContext(std::istream &is); std::string ErrorContext(const std::string &str); -// Returns a string that summarizes a vector fairly succintly, for -// printing stats in info lines. 
+/** Returns a string that summarizes a vector fairly succinctly, for
   printing stats in info lines.  For example:
   "[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.001,0.003,0.003,0.004 \
    0.005,0.01,0.07,0.11,0.14 0.18,0.24,0.29,0.39), mean=0.0745, stddev=0.0611]"
*/ std::string SummarizeVector(const Vector &vec); /** Print to 'os' some information about the mean and standard deviation of @@ -213,13 +216,25 @@ void PrintParameterStats(std::ostringstream &os, PrintParameterStats(os, "linear-params", linear_params_; would print to 'os' something like the string ", linear-params-rms=0.239". - If you set include_mean to true, it will print something like + If you set 'include_mean' to true, it will print something like ", linear-params-{mean-stddev}=0.103,0.183". + If you set 'include_row_norms' to true, it will print something + like + ", linear-params-row-norms=[percentiles(0,1........, stddev=0.0508]" + If you set 'include_column_norms' to true, it will print something + like + ", linear-params-col-norms=[percentiles(0,1........, stddev=0.0508]" + If you set 'include_singular_values' to true, it will print something + like + ", linear-params-singular-values=[percentiles(0,1........, stddev=0.0508]" */ void PrintParameterStats(std::ostringstream &os, const std::string &name, const CuMatrix &params, - bool include_mean = false); + bool include_mean = false, + bool include_row_norms = false, + bool include_column_norms = false, + bool include_singular_values = false); } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index d6c4e2163bf..471e7d943d4 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1229,7 +1229,11 @@ void AffineComponent::PerturbParams(BaseFloat stddev) { std::string AffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "linear-params", linear_params_); + 
PrintParameterStats(stream, "linear-params", linear_params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values PrintParameterStats(stream, "bias", bias_params_, true); return stream.str(); } @@ -2100,12 +2104,6 @@ void PerElementScaleComponent::Backprop( PerElementScaleComponent *to_update = dynamic_cast(to_update_in); - if (in_deriv) { - // Propagate the derivative back to the input. - in_deriv->CopyFromMat(out_deriv); - in_deriv->MulColsVec(scales_); - } - if (to_update != NULL) { // Next update the model (must do this 2nd so the derivatives we propagate // are accurate, in case this == to_update_in.) @@ -2114,6 +2112,13 @@ void PerElementScaleComponent::Backprop( else // the call below is to a virtual function that may be re-implemented to_update->Update(debug_info, in_value, out_deriv); // by child classes. } + + if (in_deriv) { + // Propagate the derivative back to the input. + if (in_deriv->Data() != out_deriv.Data()) + in_deriv->CopyFromMat(out_deriv); + in_deriv->MulColsVec(scales_); + } } void PerElementScaleComponent::Read(std::istream &is, bool binary) { @@ -2968,9 +2973,7 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, std::string NaturalGradientAffineComponent::Info() const { std::ostringstream stream; - stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "linear-params", linear_params_); - PrintParameterStats(stream, "bias", bias_params_, true); + stream << AffineComponent::Info(); stream << ", rank-in=" << rank_in_ << ", rank-out=" << rank_out_ << ", num-samples-history=" << num_samples_history_ @@ -3072,6 +3075,12 @@ void LinearComponent::Read(std::istream &is, bool binary) { KALDI_ASSERT(token == ""); ExpectToken(is, binary, ""); params_.Read(is, binary); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } 
ExpectToken(is, binary, ""); ReadBasicType(is, binary, &use_natural_gradient_); @@ -3149,6 +3158,10 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); + + orthonormal_constraint_ = 0.0; + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -3160,6 +3173,10 @@ void LinearComponent::Write(std::ostream &os, WriteUpdatableCommon(os, binary); // Write the opening tag and learning rate WriteToken(os, binary, ""); params_.Write(os, binary); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, use_natural_gradient_); @@ -3183,7 +3200,13 @@ void LinearComponent::Write(std::ostream &os, std::string LinearComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); - PrintParameterStats(stream, "params", params_); + PrintParameterStats(stream, "params", params_, + false, // include_mean + true, // include_row_norms + true, // include_column_norms + GetVerboseLevel() >= 2); // include_singular_values + if (orthonormal_constraint_ != 0.0) + stream << ", orthonormal-constraint=" << orthonormal_constraint_; stream << ", use-natural-gradient=" << (use_natural_gradient_ ? 
"true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() @@ -3249,12 +3272,14 @@ LinearComponent::LinearComponent( const LinearComponent &other): UpdatableComponent(other), params_(other.params_), + orthonormal_constraint_(other.orthonormal_constraint_), use_natural_gradient_(other.use_natural_gradient_), preconditioner_in_(other.preconditioner_in_), preconditioner_out_(other.preconditioner_out_) { } LinearComponent::LinearComponent(const CuMatrix ¶ms): params_(params), + orthonormal_constraint_(0.0), use_natural_gradient_(true) { // Set defaults for natural gradient. preconditioner_in_.SetRank(40); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index d7cece06284..099d0c8fa2a 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -919,6 +919,16 @@ class NaturalGradientAffineComponent: public AffineComponent { bias-stddev, bias-mean) to initialize the parameters. Dimension is output-dim by (input-dim + 1), last column is interpreted as the bias. + orthonormal-constraint=0.0 If you set this to 1.0, then + this matrix will be (approximately) constrained during + training to have orthonormal rows (or columns, whichever + is fewer). You can choose a positive nonzero value different + than 1.0 to have a scaled orthonormal matrix, i.e. with singular + values at the selected value (e.g. 0.5, or 2.0). + This is not enforced inside the component + itself; you have to call ConstrainOrthonormal() + from the training code to do this. All this component + does is return the OrthonormalConstraint() value. 
Options to the natural gradient (you won't normally have to set these, the defaults are suitable): @@ -982,14 +992,19 @@ class LinearComponent: public UpdatableComponent { explicit LinearComponent(const LinearComponent &other); explicit LinearComponent(const CuMatrix ¶ms); + + BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; } + CuMatrixBase &Params() { return params_; } + const CuMatrixBase &Params() const { return params_; } private: // disallow assignment operator. LinearComponent &operator= ( const LinearComponent&); - CuMatrix params_; + + BaseFloat orthonormal_constraint_; // If true (and if no this->is_gradient_), use natural gradient updates. bool use_natural_gradient_; OnlineNaturalGradient preconditioner_in_; @@ -1460,8 +1475,12 @@ class PermuteComponent: public Component { -// PerElementScaleComponent scales each dimension of its input with a separate -// trainable scale; it's like a linear component with a diagonal matrix. +/** + PerElementScaleComponent scales each dimension of its input with a separate + trainable scale; it's like a linear component with a diagonal matrix. This + version (and its child class NaturalGradientPerElementScaleComponent) + requires the input for backprop. See also ScaleAndOffsetComponent. 
+*/ class PerElementScaleComponent: public UpdatableComponent { public: virtual int32 InputDim() const { return scales_.Dim(); } @@ -1474,7 +1493,7 @@ class PerElementScaleComponent: public UpdatableComponent { virtual std::string Type() const { return "PerElementScaleComponent"; } virtual int32 Properties() const { return kSimpleComponent|kUpdatableComponent|kBackpropNeedsInput| - kPropagateInPlace; + kPropagateInPlace|kBackpropInPlace; } virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, @@ -1686,8 +1705,7 @@ class ConstantFunctionComponent: public UpdatableComponent { // NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but -// it uses a natural gradient update for the per-element scales, and enforces a -// maximum amount of change per minibatch, for stability. +// it uses a natural gradient update for the per-element scales. class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { public: diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 30cd47b3eb2..b73df647392 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -111,6 +111,10 @@ void NnetTrainer::TrainInternal(const NnetExample &eg, // happens when we use the model with batchnorm test-mode set). ScaleBatchnormStats(config_.batchnorm_stats_scale, nnet_); + // The following will only do something if we have a LinearComponent + // with is-constrained-orthonormal set to true. + ConstrainOrthonormal(nnet_); + // Scale deta_nnet if (success) ScaleNnet(config_.momentum, delta_nnet_); @@ -158,6 +162,21 @@ void NnetTrainer::TrainInternalBackstitch(const NnetExample &eg, max_change_scale, scale_adding, nnet_, &num_max_change_per_component_applied_, &num_max_change_global_applied_); + if (is_backstitch_step1) { + // The following will only do something if we have a LinearComponent + // with is-constrained-orthonormal set to true. We choose to do this + // only on the 1st backstitch step, for efficiency. 
+ ConstrainOrthonormal(nnet_); + } + + if (!is_backstitch_step1) { + // Scale down the batchnorm stats (keeps them fresh... this affects what + // happens when we use the model with batchnorm test-mode set). Do this + // after backstitch step 2 so that the stats are scaled down before we start + // the next minibatch. + ScaleBatchnormStats(config_.batchnorm_stats_scale, nnet_); + } + ScaleNnet(0.0, delta_nnet_); } diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 64fc3003609..21b80c15732 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -859,6 +859,79 @@ class SvdApplier { std::string component_name_pattern_; }; +// Does an update that moves M closer to being a (matrix with +// orthonormal rows) times 'scale'. Note: this will diverge if +// we start off with singular values too far from 'scale'. +void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase *M) { + // Larger alpha will update faster but will be more prone to instability. I + // believe the scalar value below shouldn't be more than 0.25 or maybe 0.5 or + // it will always be unstable. It should be > 0.0. + // The factor of 1/scale^4 is, I *believe*, going to give us the right + // kind of invariance w.r.t. the scale. + BaseFloat alpha = 0.125 / (scale * scale * scale * scale); + + // We're enforcing the rows to be orthonormal. + // define P = M M^T. If P is unit then M has orthonormal rows. + // We actually want P to equal scale^2 * I, so that M's rows are + // orthogonal with 2-norms equal to 'scale'. + // We (notionally) add to the objective function, the value + // -alpha times the sum of squared elements of Q = (P- scale^2 * I). 
+ int32 rows = M->NumRows(), cols = M->NumCols(); + CuMatrix M_update(rows, cols); + CuMatrix P(rows, rows); + P.SymAddMat2(1.0, *M, kNoTrans, 0.0); + P.CopyLowerToUpper(); + P.AddToDiag(-1.0 * scale * scale); + + if (GetVerboseLevel() >= 1) { + BaseFloat error = P.FrobeniusNorm(); + KALDI_VLOG(1) << "Error in orthogonality is " << error; + } + + // At this point, the matrix P contains what, in the math, would be Q = + // P-scale^2*I. The derivative of the objective function w.r.t. an element q(i,j) + // of Q is now equal to -2*alpha*q(i,j), i.e. we could write q_deriv(i,j) + // = -2*alpha*q(i,j) This is also the derivative of the objective function + // w.r.t. p(i,j): i.e. p_deriv(i,j) = -2*alpha*q(i,j). + // Suppose we have define this matrix as 'P_deriv'. + // The derivative of the objective w.r.t M equals + // 2 * P_deriv * M, which equals -4*alpha*(P-scale^2*I)*M. + // (Currently the matrix P contains what, in the math, is P-scale^2*I). + M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, *M, kNoTrans, 0.0); + M->AddMat(1.0, M_update); +} + +/** + This function, to be called after processing every minibatch, is responsible + for enforcing the orthogonality constraint for any components of type + LinearComponent that have the "orthonormal_constraint" value set. + */ +void ConstrainOrthonormal(Nnet *nnet) { + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *component = nnet->GetComponent(c); + LinearComponent *lc = dynamic_cast(component); + if (lc == NULL || lc->OrthonormalConstraint() == 0.0) + continue; + if (RandInt(0, 3) != 0) + continue; // For efficiency, only do this every 4 minibatches-- it won't + // stray far. 
+ + + BaseFloat scale = lc->OrthonormalConstraint(); + KALDI_ASSERT(scale > 0.0); + + CuMatrixBase ¶ms = lc->Params(); + int32 rows = params.NumRows(), cols = params.NumCols(); + if (rows <= cols) { + ConstrainOrthonormalInternal(scale, ¶ms); + } else { + CuMatrix params_trans(params, kTrans); + ConstrainOrthonormalInternal(scale, ¶ms_trans); + params.CopyFromMat(params_trans, kTrans); + } + } +} + // This code has been broken out of ReadEditConfig as it's quite long. // It implements the internals of the edit directive 'reduce-rank'. diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index d961b7cb6a0..b3dace8686f 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -251,7 +251,6 @@ struct CollapseModelConfig { void CollapseModel(const CollapseModelConfig &config, Nnet *nnet); - /** ReadEditConfig() reads a file with a similar-looking format to the config file read by Nnet::ReadConfig(), but this consists of a sequence of operations to @@ -452,6 +451,17 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, Nnet *nnet); +/** + This function, to be called after processing every minibatch, is responsible + for enforcing the orthogonality constraint for any components of type + LinearComponent that have the "orthonormal-constraint" value set to nonzero. + + In order to make it efficient on GPU, it doesn't make it completely orthonormal, + it just makes it closer to being orthonormal (times the 'orthonormal_constraint' + value). Over multiple iterations this rapidly makes it almost exactly orthonormal. + */ +void ConstrainOrthonormal(Nnet *nnet); + /** This utility function can be used to obtain the number of distinct 'n' values in a training example. This is the number of examples (e.g. sequences) that have been combined into a single example. 
(Actually diff --git a/src/nnet3bin/nnet3-show-progress.cc b/src/nnet3bin/nnet3-show-progress.cc index 7e937f0c211..25a65dbed5c 100644 --- a/src/nnet3bin/nnet3-show-progress.cc +++ b/src/nnet3bin/nnet3-show-progress.cc @@ -132,6 +132,10 @@ int main(int argc, char *argv[]) { { // Get info about magnitude of parameter change. Nnet diff_nnet(nnet1); AddNnet(nnet2, -1.0, &diff_nnet); + if (GetVerboseLevel() >= 1) { + KALDI_VLOG(1) << "Printing info for the difference between the neural nets: " + << diff_nnet.Info(); + } int32 num_updatable = NumUpdatableComponents(diff_nnet); Vector dot_prod(num_updatable); ComponentDotProducts(diff_nnet, diff_nnet, &dot_prod); @@ -139,12 +143,15 @@ int main(int argc, char *argv[]) { KALDI_LOG << "Parameter differences per layer are " << PrintVectorPerUpdatableComponent(nnet1, dot_prod); - Vector baseline_prod(num_updatable); + Vector baseline_prod(num_updatable), + new_prod(num_updatable); ComponentDotProducts(nnet1, nnet1, &baseline_prod); + ComponentDotProducts(nnet2, nnet2, &new_prod); baseline_prod.ApplyPow(0.5); + new_prod.ApplyPow(0.5); - KALDI_LOG << "Norms of parameter matrices are " - << PrintVectorPerUpdatableComponent(nnet1, baseline_prod); + KALDI_LOG << "Norms of parameter matrices are " + << PrintVectorPerUpdatableComponent(nnet2, new_prod); dot_prod.DivElements(baseline_prod); KALDI_LOG << "Relative parameter differences per layer are " From f4866afebac0db087cc4d526547cda1bb3fa053e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 15 Dec 2017 21:53:30 -0500 Subject: [PATCH 022/184] [scripts] Some changes to lstm.py which I don't want to keep all of, but backing up via git. 
--- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 112 ++++++++++++++++---- 1 file changed, 90 insertions(+), 22 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 96f63537a55..208239262b6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -609,12 +609,18 @@ def set_default_configs(self): 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, + # recurrence-scale is a scale we put on the c_t when doing linear projections + # from it... making it larger than 1 (e.g. 4) helps equalize scales. + 'recurrence-scale': 1.0, 'delay' : -1, # if you want to set 'self-repair-scale' (c.f. the # self-repair-scale-nonlinearity config value in older LSTM layers), you can # add 'self-repair-scale=xxx' to # lstm-nonlinearity-options. 'lstm-nonlinearity-options' : ' max-change=0.75', + # if self-stabilize=true, the W_all will be a + # LinearComponent followed by a ScaleAndOffsetComponent. + 'self-stabilize': False, # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 
'ng-affine-options' : ' max-change=1.5', @@ -712,9 +718,19 @@ def _generate_lstm_config(self): # providing output to gate i and operating on an appended vector [x,r] configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") - configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + + if not self.config['self-stabilize']: + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + else: + configs.append("component name={0}.W_all type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) + + configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") @@ -729,10 +745,18 @@ def _generate_lstm_config(self): configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) + configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " + "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( + name, input_descriptor, self.config['recurrence-scale'], delay)) + if 
self.config['self-stabilize']: + configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) + W_all_name = 'W_all_so' + else: + W_all_name = 'W_all' + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})))".format( + name, W_all_name, delay)) # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) @@ -796,7 +820,11 @@ def set_default_configs(self): 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, + 'recurrence-scale': 4.0, 'delay' : -1, + # if self-stabilize=true, the W_all_b will be a + # LinearComponent followed by a ScaleAndOffsetComponent. + 'self-stabilize': False, # if you want to set 'self-repair-scale' (c.f. the # self-repair-scale-nonlinearity config value in older LSTM layers), you can # add 'self-repair-scale=xxx' to @@ -900,8 +928,9 @@ def _generate_lstm_config(self): # This differs from that code by a factorization of the W_all matrix. 
configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, bottleneck_dim, - affine_str, l2_regularize_option)) + "orthonormal-constraint=1.0 output-dim={2} {3} {4}".format( + name, input_dim + cell_dim, bottleneck_dim, + affine_str, l2_regularize_option)) normalize_type = self.config['normalize-type'] if normalize_type == 'batchnorm': configs.append("component name={0}.W_batchnorm type=BatchNormComponent dim={1} ".format( @@ -910,9 +939,17 @@ def _generate_lstm_config(self): configs.append("component name={0}.W_renorm type=NormalizeComponent dim={1} ".format( name, bottleneck_dim)) - configs.append("component name={0}.W_all_b type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + if not self.config['self-stabilize']: + configs.append("component name={0}.W_all_b type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + else: + configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) + configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") @@ -929,7 +966,9 @@ def _generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) +# "IfDefined(Offset({0}.c_trunc, 
{2})))".format(name, input_descriptor, delay)) + "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( + name, input_descriptor, self.config['recurrence-scale'], delay)) if normalize_type != 'none': configs.append("component-node name={0}.W_{1} component={0}.W_{1} " "input={0}.W_all_a".format(name, @@ -939,8 +978,16 @@ def _generate_lstm_config(self): else: configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) + if self.config['self-stabilize']: + configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " + "input={0}.W_all_b".format(name)) + W_all_b_name = 'W_all_b_so' + else: + W_all_b_name = 'W_all_b' + configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.W_all_b, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})))".format( + name, W_all_b_name, delay)) # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) @@ -1016,6 +1063,11 @@ def set_default_configs(self): # add 'self-repair-scale=xxx' to # lstm-nonlinearity-options. 'lstm-nonlinearity-options' : ' max-change=0.75', + # If you set 'self-stabilize=true', for W_all_a, instead + # of a NaturalGradientAffineComponent, it has a LinearComponent followed + # by a ScaleAndOffsetComponent. This is similar to + # "SELF-STABILIZED DEEP NEURAL NETWORK" by Ghahremani and Droppo. + 'self-stabilize': False, # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', @@ -1136,9 +1188,18 @@ def _generate_lstm_config(self): # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] configs.append("## Begin LTSM layer '{0}'".format(name)) configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") - configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + if self.config['self-stabilize']: + # have LinearComponent followed by ScaleAndOffsetComponent. + configs.append("component name={0}.W_all type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) + else: + # have NaturalGradientAffineComponent + configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") @@ -1156,24 +1217,31 @@ def _generate_lstm_config(self): .format(name, dropout_proportion)) configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); configs.append("# and non-recurrent projections") - configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format( + configs.append("component name={0}.W_rp type=LinearComponent orthonormal-constraint=2.0 " + "input-dim={1} output-dim={2} {3} {4}".format( name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str, l2_regularize_option)) configs.append("### Nodes for 
the components above.") - configs.append("component-node name={0}.four_parts component={0}.W_all input=Append({1}, " + configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) + if self.config['self-stabilize']: + configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) + W_all_name = 'W_all_so' + else: + W_all_name = 'W_all' + if dropout_proportion != -1.0: # note: the 'input' is a don't-care as the component never uses it; it's required # in component-node lines. configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask " "input={0}.dropout_mask".format(name)) configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})), {0}.dropout_mask)" - .format(name, delay)) + "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})), {0}.dropout_mask)" + .format(name, W_all_name, delay)) else: configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.four_parts, IfDefined(Offset({0}.c_trunc, {1})))".format(name, delay)) + "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})))".format( + name, W_all_name, delay)) configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " "dim-offset=0 dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " From 2fd01ee65c1299f711b7f17422d6169e5622bb34 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 16 Dec 2017 01:33:28 -0500 Subject: [PATCH 023/184] [src] Fix bug in LinearComponent whereby use-natural-gradient defaulted to false. 
--- src/nnet3/nnet-simple-component.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 608f3284885..d7957f24102 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2969,12 +2969,14 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { int32 rank_in = 20, rank_out = 80, update_period = 4; BaseFloat alpha = 4.0, num_samples_history = 2000.0; + use_natural_gradient_ = true; cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); cfl->GetValue("rank-in", &rank_in); cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); + cfl->GetValue("use-natural-gradient", &use_natural_gradient_); preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); From 407f239449c9152cfb81ff57012cee2cceb16a04 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 16 Dec 2017 01:33:42 -0500 Subject: [PATCH 024/184] [src] Some refactoring of lstmb layer: using memnorm. --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 148 +++++++----------- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 3 +- 2 files changed, 54 insertions(+), 97 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 208239262b6..5827ea4d179 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -737,12 +737,13 @@ def _generate_lstm_config(self): configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) - configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") # Note from Dan: I don't remember why we are applying the backprop # truncation on both c and m appended together, instead of just on c. 
# Possibly there was some memory or speed or WER reason for it which I # have forgotten about now. - configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " + "{2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " @@ -775,12 +776,12 @@ def _generate_lstm_config(self): # This class is for lines like -# 'fast-lstmb-layer name=lstm1 input=[-1] delay=-3' -# (you can also call it 'fast-lstmb-batchnorm-layer' if you want it to end -# in a batchnorm component). +# 'lstmb-layer name=lstm1 input=[-1] delay=-3' +# +# TODO: more description # It's like fast-lstm-layer but with a bottleneck (like an SVD) in the main parameter matrix # of the LSTM (W_all, which combines all the full-rank projections of the LSTM): we divide -# it into two matrices, with batch-norm in between to stabilize the training. +# it into two matrices, with an orbatch-norm in between to stabilize the training. # # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. @@ -808,32 +809,23 @@ def _generate_lstm_config(self): # i.e. history since about t = t-20, can be # accumulated in c_t.] 
# l2-regularize=0.0 Constant controlling l2 regularization for this layer -class XconfigFastLstmbLayer(XconfigLayerBase): +class XconfigLstmbLayer(XconfigLayerBase): def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token in [ 'fast-lstmb-layer', 'fast-lstmb-batchnorm-layer' ] + assert first_token == 'lstmb-layer' XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', + self.config = { 'input':'[-1]', 'cell-dim' : -1, # this is a required argument 'bottleneck-dim': -1, # this is a required argument 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - 'recurrence-scale': 4.0, 'delay' : -1, - # if self-stabilize=true, the W_all_b will be a - # LinearComponent followed by a ScaleAndOffsetComponent. - 'self-stabilize': False, - # if you want to set 'self-repair-scale' (c.f. the - # self-repair-scale-nonlinearity config value in older LSTM layers), you can - # add 'self-repair-scale=xxx' to - # lstm-nonlinearity-options. 'lstm-nonlinearity-options' : ' max-change=0.75', # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 
'ng-affine-options' : ' max-change=1.5', - 'normalize-type': 'batchnorm', # can be 'batchnorm', 'renorm', or 'none' 'l2-regularize': 0.0, 'decay-time': -1.0 } @@ -852,30 +844,16 @@ def check_configs(self): self.config['bottleneck-dim'])) if self.config['delay'] == 0: raise RuntimeError("delay cannot be zero") - assert self.config['normalize-type'] in ['batchnorm', 'renorm', 'none'] def auxiliary_outputs(self): - return ['c'] + return [] def output_name(self, auxiliary_output = None): - node_name = ('m_batchnorm' if self.layer_type == 'fast-lstmb-batchnorm-layer' - else 'm') - if auxiliary_output is not None: - if auxiliary_output == 'c': - node_name = 'c' - self.c_needed = True - else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) - return '{0}.{1}'.format(self.name, node_name) + assert auxiliary_output is None + return '{0}.m_batchnorm'.format(self.name) def output_dim(self, auxiliary_output = None): - if auxiliary_output is not None: - if auxiliary_output == 'c': - self.c_needed = True - return self.config['cell-dim'] - # add code for other auxiliary_outputs here when we decide to expose them - else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) + assert auxiliary_output is None return self.config['cell-dim'] def get_full_config(self): @@ -924,31 +902,26 @@ def _generate_lstm_config(self): configs = [] - # See XconfigFastLstmLayer to understand what's going on here. - # This differs from that code by a factorization of the W_all matrix. + # See XconfigFastLstmLayer to understand what's going on here. This + # differs from that code by a factorization of the W_all matrix into two + # pieces with a smaller dimension in between (with the first of the two + # pieces constrained to have orthonormal rows). Note: we don't apply l2 + # regularization to this layer, since, with the orthonormality + # constraint, it's meaningless. 
configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "orthonormal-constraint=1.0 output-dim={2} {3} {4}".format( + "orthonormal-constraint=1.0 output-dim={2} {3} ".format( name, input_dim + cell_dim, bottleneck_dim, - affine_str, l2_regularize_option)) - normalize_type = self.config['normalize-type'] - if normalize_type == 'batchnorm': - configs.append("component name={0}.W_batchnorm type=BatchNormComponent dim={1} ".format( - name, bottleneck_dim)) - elif normalize_type == 'renorm': - configs.append("component name={0}.W_renorm type=NormalizeComponent dim={1} ".format( - name, bottleneck_dim)) + affine_str)) - if not self.config['self-stabilize']: - configs.append("component name={0}.W_all_b type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, - affine_str, l2_regularize_option)) - else: - configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, - affine_str, l2_regularize_option)) - configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " - "max-change=0.75".format(name, cell_dim * 4)) + configs.append("component name={0}.c_trunc_memnorm type=MemoryNormComponent dim={1} ".format( + name, cell_dim)) + + configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, + affine_str, l2_regularize_option)) + configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") @@ -958,49 +931,34 @@ def _generate_lstm_config(self): "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid 
gradient blowup in long training examples.") - # Note from Dan: I don't remember why we are applying the backprop - # truncation on both c and m appended together, instead of just on c. - # Possibly there was some memory or speed or WER reason for it which I - # have forgotten about now. - configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format(name, 2 * cell_dim, bptrunc_str)) + + configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, cell_dim, bptrunc_str)) + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) + configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " -# "IfDefined(Offset({0}.c_trunc, {2})))".format(name, input_descriptor, delay)) - "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( - name, input_descriptor, self.config['recurrence-scale'], delay)) - if normalize_type != 'none': - configs.append("component-node name={0}.W_{1} component={0}.W_{1} " - "input={0}.W_all_a".format(name, - normalize_type)) - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_{1}".format(name, normalize_type)) - else: - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_all_a".format(name)) - if self.config['self-stabilize']: - configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " - "input={0}.W_all_b".format(name)) - W_all_b_name = 'W_all_b_so' - else: - W_all_b_name = 'W_all_b' + "IfDefined(Offset({0}.c_trunc_memnorm, {2})))".format( + name, input_descriptor, delay)) + configs.append("component-node name={0}.W_all_b component={0}.W_all_b " + "input={0}.W_all_a".format(name)) + configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " + "input={0}.W_all_b".format(name)) configs.append("component-node 
name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.{1}, IfDefined(Offset({0}.c_trunc, {2})))".format( - name, W_all_b_name, delay)) - # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead - #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - # configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) - - if self.layer_type == "fast-lstmb-batchnorm-layer": - # Add the batchnorm component, if requested to include batchnorm. - configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( - name, cell_dim)) - configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " - "input={0}.m".format(name)) + "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay)) + configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 " + "dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " + "dim={1}".format(name, cell_dim)) + configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) + configs.append("component-node name={0}.c_trunc_memnorm component={0}.c_trunc_memnorm " + "input={0}.c_trunc".format(name)) + configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) return configs diff --git 
a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 0fff379cf31..db9550818cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -40,8 +40,7 @@ 'fast-lstm-batchnorm-layer' : xlayers.XconfigFastLstmLayer, 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, 'fast-lstmp-batchnorm-layer' : xlayers.XconfigFastLstmpLayer, - 'fast-lstmb-layer' : xlayers.XconfigFastLstmbLayer, - 'fast-lstmb-batchnorm-layer' : xlayers.XconfigFastLstmbLayer, + 'lstmb-layer' : xlayers.XconfigLstmbLayer, 'stats-layer': xlayers.XconfigStatsLayer, 'relu-conv-layer': xlayers.XconfigConvLayer, 'conv-layer': xlayers.XconfigConvLayer, From 76250980ec873327c34e93b146c4f2f1ab3336ba Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 18 Dec 2017 17:44:17 -0500 Subject: [PATCH 025/184] [scripts] Cosmetic change: add message --- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 2 ++ egs/wsj/s5/steps/nnet3/get_egs.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index cec6f8e166f..0294df0d84a 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -150,6 +150,8 @@ if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. rm $dir/uniq2utt $dir/valid_uttlist.tmp fi +echo "$0: creating egs. 
To ensure they are not deleted later you can do: touch $dir/.nodelete" + cat $data/utt2dur | \ awk -v min_len=$frames_per_eg -v fs=$frame_shift '{if ($2 * 1/fs >= min_len) print $1}' | \ utils/filter_scp.pl --exclude $dir/valid_uttlist | \ diff --git a/egs/wsj/s5/steps/nnet3/get_egs.sh b/egs/wsj/s5/steps/nnet3/get_egs.sh index a6dd9682616..c8cbf67c8b8 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs.sh @@ -138,6 +138,8 @@ awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlis [ -z "$transform_dir" ] && transform_dir=$alidir +echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" + # because we'll need the features with a different number of jobs than $alidir, # copy to ark,scp. if [ -f $transform_dir/raw_trans.1 ]; then From e652f0eb7891e77eea13803118467a887d07761e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 18 Dec 2017 18:08:15 -0500 Subject: [PATCH 026/184] [src] Fix optimization bug in nnet3, regarding Scale() expressions> --- src/nnet3/nnet-optimize-utils.cc | 5 ++--- src/nnet3/nnet-optimize.cc | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index e3c5edba565..d28626fee86 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -737,9 +737,7 @@ bool VariableMergingOptimizer::MergeVariables() { // potentially merge into a single variable. const NnetComputation::Command &c = computation_->commands[command_index]; int32 s1 = -1, s2 = -1; - // TODO: add kScale command and remove the check for 1.0 if (c.command_type == kMatrixCopy && - // c.alpha == 1.0 && config_.remove_assignments) { s2 = c.arg1; // s2 is the written-to matrix. s1 = c.arg2; @@ -997,7 +995,8 @@ std::pair VariableMergingOptimizer::MayBeMerged( if (!left && !right) // save some time. 
return std::pair(false,false); bool is_assignment = (computation_->commands[command_index].command_type == - kMatrixCopy); + kMatrixCopy && + computation_->commands[command_index].alpha == 1.0); ComputationAnalysis analysis(*computation_, analyzer_); if (is_assignment) { if (analysis.FirstNontrivialAccess(s2) == command_index && diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 3b2fe6b5b2f..212f707aefc 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -439,7 +439,7 @@ void ConvertAdditionToAssignment(const Nnet &nnet, case kMatrixAdd: c.command_type = kMatrixCopy; break; case kAddRows: c.command_type = kCopyRows; - break; + break; case kAddRowsMulti: c.command_type = kCopyRowsMulti; break; // note: kCopyToRowsMulti does not currently support alpha != 1.0. From b6e17dd9dfec9f9d80e6e1178fa7a91dc4d1043c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 21 Dec 2017 14:46:48 -0800 Subject: [PATCH 027/184] [src] Simplify/refactor natural gradient code in nnet3 --- src/nnet3/natural-gradient-online-test.cc | 5 +- src/nnet3/natural-gradient-online.cc | 59 +++++++---------------- src/nnet3/natural-gradient-online.h | 55 ++++++++++++++------- src/nnet3/nnet-convolutional-component.cc | 7 +-- src/nnet3/nnet-general-component.cc | 2 +- src/nnet3/nnet-optimize-test.cc | 2 +- src/nnet3/nnet-simple-component.cc | 22 ++++----- 7 files changed, 71 insertions(+), 81 deletions(-) diff --git a/src/nnet3/natural-gradient-online-test.cc b/src/nnet3/natural-gradient-online-test.cc index 7c46dfb3596..2829d4ebde7 100644 --- a/src/nnet3/natural-gradient-online-test.cc +++ b/src/nnet3/natural-gradient-online-test.cc @@ -270,7 +270,7 @@ void UnitTestPreconditionDirectionsOnline() { if (Rand() % 3 == 0) zero = true; //else if (Rand() % 2 == 0) one = true; - CuVector row_prod1(N), row_prod2(N); + CuVector row_prod1(N); BaseFloat gamma1, gamma2; BaseFloat big_eig_factor = RandInt(1, 20); big_eig_factor = big_eig_factor * big_eig_factor; @@ 
-300,14 +300,13 @@ void UnitTestPreconditionDirectionsOnline() { preconditioner1.PreconditionDirections(&Mcopy1, &row_prod1, &gamma1); - preconditioner2.PreconditionDirections(&Mcopy2, &row_prod2, &gamma2); + preconditioner2.PreconditionDirections(&Mcopy2, &gamma2); BaseFloat trace1 = TraceMatMat(M, M, kTrans), trace2 = TraceMatMat(Mcopy1, Mcopy1, kTrans); AssertEqual(trace1, trace2 * gamma2 * gamma2, 1.0e-02); AssertEqual(Mcopy1, Mcopy2); - AssertEqual(row_prod1, row_prod2, 1.0e-02); AssertEqual(gamma1, gamma2, 1.0e-02); // make sure positive definite diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index cf0311449db..9c9652559de 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -146,7 +146,7 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { for (int32 i = 0; i < num_init_iters; i++) { BaseFloat scale; R0_copy.CopyFromMat(R0); - this_copy.PreconditionDirections(&R0_copy, NULL, &scale); + this_copy.PreconditionDirections(&R0_copy, &scale); } rank_ = this_copy.rank_; W_t_.Swap(&this_copy.W_t_); @@ -157,21 +157,13 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { void OnlineNaturalGradient::PreconditionDirections( CuMatrixBase *X_t, - CuVectorBase *row_prod, BaseFloat *scale) { if (X_t->NumCols() == 1) { // If the dimension of the space equals one then our natural gradient update // with rescaling becomes a no-op, but the code wouldn't naturally handle it // because rank would be zero. Support this as a special case. 
- if (row_prod) - row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0); - *scale = 1.0; - return; - } - - if (row_prod == NULL) { - CuVector row_prod_tmp(X_t->NumRows()); - PreconditionDirections(X_t, &row_prod_tmp, scale); + if (scale) + *scale = 1.0; return; } @@ -191,7 +183,17 @@ void OnlineNaturalGradient::PreconditionDirections( BaseFloat rho_t(rho_t_); Vector d_t(d_t_); read_write_mutex_.unlock(); - PreconditionDirectionsInternal(t, rho_t, d_t, &WJKL_t, X_t, row_prod, scale); + + BaseFloat initial_product = TraceMatMat(*X_t, *X_t, kTrans); + PreconditionDirectionsInternal(t, rho_t, initial_product, d_t, &WJKL_t, X_t); + if (scale) { + if (initial_product <= 0.0) { + *scale = 1.0; + } else { + BaseFloat final_product = TraceMatMat(*X_t, *X_t, kTrans); + *scale = sqrt(initial_product / final_product); + } + } } void OnlineNaturalGradient::ReorthogonalizeXt1( @@ -320,11 +322,10 @@ void OnlineNaturalGradient::SelfTest() const { void OnlineNaturalGradient::PreconditionDirectionsInternal( const int32 t, const BaseFloat rho_t, + const BaseFloat tr_X_Xt, const Vector &d_t, CuMatrixBase *WJKL_t, - CuMatrixBase *X_t, - CuVectorBase *row_prod, - BaseFloat *scale) { + CuMatrixBase *X_t) { int32 N = X_t->NumRows(), // Minibatch size. D = X_t->NumCols(), // Dimensions of vectors we're preconditioning R = rank_; // Rank of correction to unit matrix. @@ -383,17 +384,8 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( if (!frozen_) num_updates_skipped_++; - BaseFloat tr_Xt_XtT = TraceMatMat(*X_t, *X_t, kTrans); // X_hat_t = X_t - H_t W_t X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0); - // each element i of row_prod will be inner product of row i of X_hat_t with - // itself. - row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0); - BaseFloat tr_Xhat_XhatT = row_prod->Sum(); - KALDI_ASSERT(tr_Xhat_XhatT == tr_Xhat_XhatT); // Check for NaN. - BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 
1.0 : - sqrt(tr_Xt_XtT / tr_Xhat_XhatT)); - *scale = gamma_t; return; } J_t.AddMatMat(1.0, H_t, kTrans, *X_t, kNoTrans, 0.0); // J_t = H_t^T X_t @@ -456,31 +448,14 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( if (nf > 0 && self_debug_) { KALDI_WARN << "Floored " << nf << " elements of C_t."; } - BaseFloat tr_Xt_XtT_check; - if (self_debug_) - tr_Xt_XtT_check = TraceMatMat(*X_t, *X_t, kTrans); X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0); // X_hat_t = X_t - H_t W_t - // set *row_prod to inner products of each row of X_hat_t with itself. - row_prod->AddDiagMat2(1.0, *X_t, kNoTrans, 0.0); - - BaseFloat tr_Xhat_XhatT = row_prod->Sum(); - // tr(X_t X_t^T) = tr(X_hat_t X_hat_t^T) - tr(L_t E_t) + 2 tr(L_t) - double tr_Xt_XtT = tr_Xhat_XhatT; - for (int32 i = 0; i < R; i++) - tr_Xt_XtT += L_t_cpu(i, i) * (2.0 - e_t(i)); - if (self_debug_) { - KALDI_ASSERT(ApproxEqual(tr_Xt_XtT, tr_Xt_XtT_check)); - } - BaseFloat gamma_t = (tr_Xhat_XhatT == 0.0 ? 1.0 : - sqrt(tr_Xt_XtT / tr_Xhat_XhatT)); - *scale = gamma_t; Vector sqrt_c_t(c_t); sqrt_c_t.ApplyPow(0.5); // \rho_{t+1} = 1/(D - R) (\eta/N tr(X_t X_t^T) + (1-\eta)(D \rho_t + tr(D_t)) - tr(C_t^{0.5})). - BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_Xt_XtT + BaseFloat rho_t1 = 1.0 / (D - R) * (eta / N * tr_X_Xt + (1-eta)*(D * rho_t + d_t.Sum()) - sqrt_c_t.Sum()); // D_{t+1} = C_t^{0.5} - \rho_{t+1} I diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index 67c25eb0dbc..0c43c7f5c46 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -437,33 +437,48 @@ class OnlineNaturalGradient { // see comment where 'frozen_' is declared. inline void Freeze(bool frozen) { frozen_ = frozen; } - // The "R" pointer is both the input (R in the comment) and the output (P in - // the comment; equal to the preconditioned directions before scaling by - // gamma). 
If the pointer "row_prod" is supplied, it's set to the inner product - // of each row of the preconditioned directions P, at output, with itself. - // You would need to apply "scale" to R and "scale * scale" to row_prod, to - // get the preconditioned directions; we don't do this ourselves, in order to - // save CUDA calls. + /** + This call implements the main functionality of this class. + + @param [in,out] R The "R" pointer is both the input (R in the + comment, X in the paper), and the output (P in the comment, + X with a hat on it in the paper). Each row of R is viewed + as a vector in some space, where we're estimating a smoothed + Fisher matrix and then multiplying by the inverse of that + smoothed Fisher matrix. + + @param [out] scale If non-NULL, a scaling factor is written to here, + and the output 'R' should be multiplied by this factor by + the user (we don't do it internally, to save an operation). + The factor is chosen so that the vector 2-norm of R is the + same after the natural gradient as it was before. (The pointer + being NULL or non-NULL doesn't affect the magnitude of R; + in any case the user will probably want to do this rescaling, + the question being whether they want to do so manually or + not. + + */ void PreconditionDirections(CuMatrixBase *R, - CuVectorBase *row_prod, BaseFloat *scale); + + // Copy constructor. explicit OnlineNaturalGradient(const OnlineNaturalGradient &other); // Assignent operator OnlineNaturalGradient &operator = (const OnlineNaturalGradient &other); private: - // This does the work of PreconditionDirections (the top-level - // function handles some multithreading issues and then calls this function). + + // This is an internal function called from PreconditionDirections(), + // which handles some multithreading issues and then calls this function. // Note: WJKL_t (dimension 2*R by D + R) is [ W_t L_t; J_t K_t ]. 
void PreconditionDirectionsInternal(const int32 t, const BaseFloat rho_t, + const BaseFloat tr_X_Xt, const Vector &d_t, CuMatrixBase *WJKL_t, - CuMatrixBase *X_t, - CuVectorBase *row_prod, - BaseFloat *scale); + CuMatrixBase *X_t); void ComputeEt(const VectorBase &d_t, BaseFloat beta_t, @@ -512,10 +527,14 @@ class OnlineNaturalGradient { // or columns. static void InitOrthonormalSpecial(CuMatrixBase *R); - // Returns the learning rate eta as the function of the number of samples - // (actually, N is the number of vectors we're preconditioning, which due to - // context is not always exactly the same as the number of samples). The - // value returned depends on num_samples_history_. + // Returns the value eta (with 0 < eta < 1) which reflects how fast we update + // the estimate of the Fisher matrix (larger == faster). This is a function + // rather than a constant because we set this indirectly, via + // num_samples_history_ or num_minibatches_history_. The argument N is the + // number of vectors we're preconditioning, which is the number of rows in the + // argument R to PreconditionDirections(); you can think of it as the number + // of vectors we're preconditioning (and in the common case it's some multiple + // of the minibatch size) BaseFloat Eta(int32 N) const; // called if self_debug_ = true, makes sure the members satisfy certain @@ -593,13 +612,13 @@ class OnlineNaturalGradient { BaseFloat rho_t_; Vector d_t_; - // Used to prevent parameters being read or written in an inconsistent state. std::mutex read_write_mutex_; // This mutex is used to control which thread gets to update the // parameters, in multi-threaded code. 
std::mutex update_mutex_; + }; } // namespace nnet3 diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index 333d7a79cfa..bea3b9d31d5 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -389,14 +389,11 @@ void TimeHeightConvolutionComponent::UpdateNaturalGradient( // scalars are different across iterations, the scalars // will be pretty similar on different iterations BaseFloat scale1, scale2; - preconditioner_in_.PreconditionDirections(¶ms_deriv, NULL, - &scale1); + preconditioner_in_.PreconditionDirections(¶ms_deriv, &scale1); CuMatrix params_deriv_transpose(params_deriv, kTrans); - preconditioner_out_.PreconditionDirections(¶ms_deriv_transpose, - NULL, &scale2); - + preconditioner_out_.PreconditionDirections(¶ms_deriv_transpose, &scale2); linear_params_.AddMat( learning_rate_ * scale1 * scale2, diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index bfb972f8735..bc7405f2836 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1251,7 +1251,7 @@ void ConstantComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + &scale); to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { diff --git a/src/nnet3/nnet-optimize-test.cc b/src/nnet3/nnet-optimize-test.cc index bcb02184720..35614d62b34 100644 --- a/src/nnet3/nnet-optimize-test.cc +++ b/src/nnet3/nnet-optimize-test.cc @@ -143,7 +143,7 @@ static bool UnitTestNnetOptimizeWithOptions(int32 srand_seed, KALDI_LOG << "Output sum (optimized) is " << output_opt.Sum(); if (!ApproxEqual(output, output_opt)) { KALDI_WARN << "Non-optimized and optimized versions of the computation give " - << "different outputs."; + << "different outputs: " << output << " vs. 
" << output_opt; return false; } diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index d7957f24102..f2019849117 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1516,7 +1516,7 @@ void NaturalGradientRepeatedAffineComponent::Update( try { // Only apply the preconditioning/natural-gradient if we're not computing // the exact gradient. - preconditioner_in_.PreconditionDirections(&deriv, NULL, &scale); + preconditioner_in_.PreconditionDirections(&deriv, &scale); } catch (...) { int32 num_bad_rows = 0; for (int32 i = 0; i < out_deriv.NumRows(); i++) { @@ -2132,7 +2132,7 @@ void PerElementOffsetComponent::Backprop( // this scenario) CuMatrix out_deriv_copy(out_deriv_reshaped); BaseFloat scale = 1.0; - to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, NULL, + to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); @@ -2417,7 +2417,7 @@ void ScaleAndOffsetComponent::BackpropInternal( BaseFloat scale = 1.0; CuMatrix out_deriv_copy(out_deriv); to_update->offset_preconditioner_.PreconditionDirections( - &out_deriv_copy, NULL, &scale); + &out_deriv_copy, &scale); to_update->offsets_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } @@ -2440,7 +2440,7 @@ void ScaleAndOffsetComponent::BackpropInternal( BaseFloat scale = 1.0; if (to_update->use_natural_gradient_ && !to_update->is_gradient_) { to_update->scale_preconditioner_.PreconditionDirections( - &in_value_reconstructed, NULL, &scale); + &in_value_reconstructed, &scale); } to_update->scales_.AddRowSumMat(scale * to_update->learning_rate_, in_value_reconstructed); @@ -2506,7 +2506,7 @@ void ConstantFunctionComponent::Backprop( CuMatrix out_deriv_copy(out_deriv); BaseFloat scale = 1.0; to_update->preconditioner_.PreconditionDirections(&out_deriv_copy, - NULL, &scale); + &scale); 
to_update->output_.AddRowSumMat(scale * to_update->learning_rate_, out_deriv_copy); } else { @@ -2847,8 +2847,8 @@ void NaturalGradientAffineComponent::Update( // than having the matrices scaled inside the preconditioning code). BaseFloat in_scale, out_scale; - preconditioner_in_.PreconditionDirections(&in_value_temp, NULL, &in_scale); - preconditioner_out_.PreconditionDirections(&out_deriv_temp, NULL, &out_scale); + preconditioner_in_.PreconditionDirections(&in_value_temp, &in_scale); + preconditioner_out_.PreconditionDirections(&out_deriv_temp, &out_scale); // "scale" is a scaling factor coming from the PreconditionDirections calls // (it's faster to have them output a scaling factor than to have them scale @@ -3077,9 +3077,9 @@ void LinearComponent::Backprop(const std::string &debug_info, // than having the matrices scaled inside the preconditioning code). BaseFloat in_scale, out_scale; to_update->preconditioner_in_.PreconditionDirections(&in_value_temp, - NULL, &in_scale); + &in_scale); to_update->preconditioner_out_.PreconditionDirections(&out_deriv_temp, - NULL, &out_scale); + &out_scale); BaseFloat local_lrate = in_scale * out_scale * to_update->learning_rate_; to_update->params_.AddMatMat(local_lrate, out_deriv_temp, kTrans, @@ -3753,7 +3753,7 @@ void NaturalGradientPerElementScaleComponent::Update( // scales_.AddRowSumMat(learning_rate_, derivs_per_frame). 
BaseFloat scale; - preconditioner_.PreconditionDirections(&derivs_per_frame, NULL, &scale); + preconditioner_.PreconditionDirections(&derivs_per_frame, &scale); CuVector delta_scales(scales_.Dim()); delta_scales.AddRowSumMat(scale * learning_rate_, derivs_per_frame); @@ -5632,7 +5632,7 @@ void LstmNonlinearityComponent::Backprop( BaseFloat scale = 1.0; if (!to_update->is_gradient_) { to_update->preconditioner_.PreconditionDirections( - ¶ms_deriv, NULL, &scale); + ¶ms_deriv, &scale); } to_update->params_.AddMat(to_update->learning_rate_ * scale, params_deriv); From ea7efbade674d16b306c951ab047b8fc39c9697c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 21 Dec 2017 19:11:00 -0800 Subject: [PATCH 028/184] [src] Fix bug in compilation with Scale() expressions. --- src/nnet3/nnet-compile.cc | 10 +++++----- src/nnet3/nnet-descriptor.cc | 2 ++ src/nnet3/nnet-optimize-utils.cc | 1 + src/nnet3/nnet-optimize.cc | 4 ++++ 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index bac182a5ac5..93f35dc8615 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -449,7 +449,6 @@ void Compiler::ComputeInputLocationsList( const std::vector &output_indexes = step_info.output_indexes; const NetworkNode &node = nnet_.GetNode(step_info.node_index); const SumDescriptor &descriptor = node.descriptor.Part(part_index); - int32 num_indexes = output_indexes.size(); submat_locations_list->clear(); submat_locations_list->resize(num_indexes); @@ -664,6 +663,7 @@ void Compiler::CompileForwardSumDescriptor( int32 value_submatrix_index = step_info.value_parts[part_index]; const SumDescriptor &descriptor = nnet_.GetNode(step_info.node_index).descriptor.Part(part_index); + BaseFloat offset_term = descriptor.GetScaleForNode(-1); if (offset_term != 0.0) { computation->commands.push_back( @@ -763,10 +763,10 @@ void Compiler::CompileForwardFromSubmatLocations( std::vector indexes; if 
(ConvertToIndexes(submat_locations, &input_submatrix_index, &indexes)) { CompileForwardFromIndexes(value_submatrix_index, - input_submatrix_index, - alpha, - indexes, - computation); + input_submatrix_index, + alpha, + indexes, + computation); return; } else { // There are multiple source matrices. diff --git a/src/nnet3/nnet-descriptor.cc b/src/nnet3/nnet-descriptor.cc index 55bbb258b52..fb3d152dc2e 100644 --- a/src/nnet3/nnet-descriptor.cc +++ b/src/nnet3/nnet-descriptor.cc @@ -841,6 +841,7 @@ bool GeneralDescriptor::Normalize(GeneralDescriptor *desc) { desc->descriptor_type_ = child->descriptor_type_; desc->value1_ = child->value1_; desc->value2_ = child->value2_; + desc->alpha_ = child->alpha_; child->descriptors_.clear(); // avoid delete in destructor. delete child; changed = true; @@ -897,6 +898,7 @@ bool GeneralDescriptor::Normalize(GeneralDescriptor *desc) { desc->descriptors_.swap(child->descriptors_); desc->value1_ = child->value1_; desc->value2_ = child->value2_; + desc->alpha_ = child->alpha_; child->descriptors_.clear(); // avoid delete in destructor. delete child; changed = true; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index d28626fee86..ded700dbbd8 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -2729,6 +2729,7 @@ void ComputationExpander::ExpandRowsCommand( // in the vector are row-indexes into s2. 
int32 old_arg3 = c_out->arg3; c_out->arg3 = expanded_computation_->indexes.size(); + c_out->alpha = c_in.alpha; expanded_computation_->indexes.push_back(std::vector()); std::vector &new_indexes = expanded_computation_->indexes.back(); const std::vector &old_indexes = computation_.indexes[old_arg3]; diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 212f707aefc..e12cb7b1c42 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -852,6 +852,10 @@ const NnetComputation* CachingOptimizingCompiler::CompileViaShortcut( need_debug_info, num_n_values, ans); seconds_taken_expand_ += timer.Elapsed(); } + if (GetVerboseLevel() >= 3) { + CheckComputation(nnet_, *ans, false); + } + { Timer timer; ans->ComputeCudaIndexes(); From 69d0d380b5e14771f0284429d8626f02e6b3f6a1 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 22 Dec 2017 18:45:17 -0500 Subject: [PATCH 029/184] [scripts] Fixing bug in fast-lstm-layer and lstmb-layer whereby c was used instead of m for affine transform. --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 27 ++++++++++----------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 5827ea4d179..3743413ab34 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -737,17 +737,13 @@ def _generate_lstm_config(self): configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) - # Note from Dan: I don't remember why we are applying the backprop - # truncation on both c and m appended together, instead of just on c. - # Possibly there was some memory or speed or WER reason for it which I - # have forgotten about now. 
configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " "{2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " - "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( + "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( name, input_descriptor, self.config['recurrence-scale'], delay)) if self.config['self-stabilize']: configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) @@ -763,6 +759,7 @@ def _generate_lstm_config(self): configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) if self.layer_type == "fast-lstm-batchnorm-layer": # Add the batchnorm component, if requested to include batchnorm. 
@@ -914,7 +911,7 @@ def _generate_lstm_config(self): name, input_dim + cell_dim, bottleneck_dim, affine_str)) - configs.append("component name={0}.c_trunc_memnorm type=MemoryNormComponent dim={1} ".format( + configs.append("component name={0}.m_trunc_memnorm type=MemoryNormComponent dim={1} ".format( name, cell_dim)) configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " @@ -932,15 +929,15 @@ def _generate_lstm_config(self): l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format( - name, cell_dim, bptrunc_str)) + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, 2 * cell_dim, bptrunc_str)) configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( name, cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc_memnorm, {2})))".format( + "IfDefined(Offset(Scale(1.0, {0}.m_trunc_memnorm), {2})))".format( name, input_descriptor, delay)) configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) @@ -950,13 +947,15 @@ def _generate_lstm_config(self): configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( name, delay)) - configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 " - "dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) - configs.append("component-node name={0}.c_trunc_memnorm 
component={0}.c_trunc_memnorm " - "input={0}.c_trunc".format(name)) + configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + "dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + "dim={1}".format(name, cell_dim)) + configs.append("component-node name={0}.m_trunc_memnorm component={0}.m_trunc_memnorm " + "input={0}.m_trunc".format(name)) configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) From 823cfe738703eb938b39a9f73dbc3e7da13896a8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 22 Dec 2017 19:05:00 -0500 Subject: [PATCH 030/184] [egs] Extend compare_wer_general.sh for tedlium to print num-params --- egs/tedlium/s5_r2/local/chain/compare_wer_general.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh index 00b2d29cc88..88dde1ff0e2 100755 --- a/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh +++ b/egs/tedlium/s5_r2/local/chain/compare_wer_general.sh @@ -102,5 +102,10 @@ for x in $*; do prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') printf "% 10s" $prob done +echo +echo -n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done echo From 6f781145cdadeaf351f73fc51d0e93cd51df0f22 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 22 Dec 2017 19:05:22 -0500 Subject: [PATCH 031/184] [scripts] Cosmetic fix to chain training script --- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py 
b/egs/wsj/s5/steps/nnet3/chain/train.py index b62f5510e3c..7f607abd8dc 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -588,7 +588,7 @@ def train(args, run_opts): with open("{dir}/accuracy.report".format(dir=args.dir), "w") as f: f.write(report) - common_lib.execute_command("steps/info/nnet3_dir_info.pl " + common_lib.execute_command("steps/info/chain_dir_info.pl " "{0}".format(args.dir)) From acda336781a2acdbbb5f49c32712335e374faede Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 22 Dec 2017 23:53:15 -0800 Subject: [PATCH 032/184] [src] Diagonal-natural-gradient --- src/nnet3/natural-gradient-online.cc | 216 ++++++++++++++++++--------- src/nnet3/natural-gradient-online.h | 135 +++++++++++++++-- src/nnet3/nnet-simple-component.cc | 169 ++++++++++++++------- src/nnet3/nnet-simple-component.h | 21 ++- 4 files changed, 399 insertions(+), 142 deletions(-) diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 9c9652559de..4c4d5a1b888 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -26,8 +26,9 @@ namespace nnet3 { OnlineNaturalGradient::OnlineNaturalGradient(): rank_(40), update_period_(1), num_samples_history_(2000.0), num_minibatches_history_(0.0), alpha_(4.0), - epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(-1), - num_updates_skipped_(0), self_debug_(false) { } + epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0), + self_debug_(false), + diagonal_power_(0.0), diagonal_epsilon_(1.0e-03) { } /** @@ -123,6 +124,7 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { // for locking reasons it's better to use a different object. OnlineNaturalGradient this_copy(*this); this_copy.InitDefault(D); + this_copy.t_ = 1; // Prevent recursion to Init() again. 
CuMatrix R0_copy(R0.NumRows(), R0.NumCols(), kUndefined); // 'num_iters' is number of iterations with the same data from a pseudorandom @@ -152,7 +154,6 @@ void OnlineNaturalGradient::Init(const CuMatrixBase &R0) { W_t_.Swap(&this_copy.W_t_); d_t_.Swap(&this_copy.d_t_); rho_t_ = this_copy.rho_t_; - t_ = 0; } void OnlineNaturalGradient::PreconditionDirections( @@ -167,25 +168,28 @@ void OnlineNaturalGradient::PreconditionDirections( return; } - read_write_mutex_.lock(); - if (t_ == -1) // not initialized + if (t_ == 0) // not initialized Init(*X_t); - // Now t_ >= 0. - // We create local copies of the class variables... this is intended for - // multi-threaded safety so we can't read them in an inconsistent state, - // but we don't really waste anything here (a copy of W_t is needed anyway, - // if we're to update it). - int32 t = t_, R = W_t_.NumRows(), D = W_t_.NumCols(); + int32 R = W_t_.NumRows(), D = W_t_.NumCols(); // space for W_t, J_t, K_t, L_t. CuMatrix WJKL_t(2 * R, D + R); WJKL_t.Range(0, R, 0, D).CopyFromMat(W_t_); BaseFloat rho_t(rho_t_); Vector d_t(d_t_); - read_write_mutex_.unlock(); - BaseFloat initial_product = TraceMatMat(*X_t, *X_t, kTrans); - PreconditionDirectionsInternal(t, rho_t, initial_product, d_t, &WJKL_t, X_t); + bool updating = Updating(); + + BaseFloat initial_product; + if (diagonal_power_ == 0.0 || scale != NULL) + initial_product = TraceMatMat(*X_t, *X_t, kTrans); + + if (diagonal_power_ == 0.0) + PreconditionDirectionsInternal(rho_t, initial_product, + updating, d_t, &WJKL_t, X_t); + else + PreconditionDirectionsDiagonal(rho_t, updating, d_t, &WJKL_t, X_t); + if (scale) { if (initial_product <= 0.0) { *scale = 1.0; @@ -194,6 +198,115 @@ void OnlineNaturalGradient::PreconditionDirections( *scale = sqrt(initial_product / final_product); } } + t_ += 1; +} + +void OnlineNaturalGradient::PreconditionDirectionsDiagonal( + const BaseFloat rho_t, + bool updating, + const Vector &d_t, + CuMatrixBase *WJKL_t, + CuMatrixBase *X_t) { + 
KALDI_ASSERT(diagonal_power_ > 0.0 && diagonal_power_ <= 1.0 && + (diagonal_mean_.Dim() != 0 || updating)); + + int32 dim = X_t->NumCols(); + + if (diagonal_mean_.Dim() == 0) { + InitDiagonalParams(*X_t); + updating = false; + } + + CuVector new_diagonal_mean, new_diagonal_uvar; + + if (updating) { + new_diagonal_mean.Resize(dim, kUndefined); + new_diagonal_uvar.Resize(dim, kUndefined); + UpdateDiagonalStats(*X_t, &new_diagonal_mean, &new_diagonal_uvar); + } + + X_t->MulColsVec(diagonal_scale_); + + PreconditionDirectionsInternal(rho_t, TraceMatMat(*X_t, *X_t, kTrans), false, + d_t, WJKL_t, X_t); + + // We apply the scale both before and after the identity-plus-low-rank matrix, + // so that the combined matrix is symmetric. + X_t->MulColsVec(diagonal_scale_); + + + // If we're updating the diagonal mean and variance we do so *after* + // preconditioning the data. This is out of a concern about the provability + // of convergence (making it independent of the current minibatch). Most + // likely, in practice it would work fine updating it before, it might even be + // a little bit more stable. Anyway, this is how we're doing it, and it's how + // we did it for the core part of the natural gradient. + if (updating) { + diagonal_mean_.Swap(&new_diagonal_mean); + diagonal_uvar_.Swap(&new_diagonal_uvar); + UpdateDiagonalScale(); + } +} + +void OnlineNaturalGradient::UpdateDiagonalStats( + const CuMatrixBase &X, + CuVectorBase *diagonal_mean_new, + CuVectorBase *diagonal_uvar_new){ + int32 dim = X.NumCols(), num_rows = X.NumRows(); + KALDI_ASSERT(diagonal_mean_new->Dim() == dim && diagonal_uvar_new->Dim() == dim && + diagonal_mean_.Dim() == dim); + BaseFloat eta = Eta(X.NumRows()); + // 'eta' is a value that reflects how fast we update these stats, which is + // smaller if we're updating them slower, but strictly less than 1. It's + // basically the scale on the new stats, with 1-eta being the scale on the old + // stats. 
+ KALDI_ASSERT(eta > 0 && eta < 1.0); + + diagonal_mean_new->CopyFromVec(diagonal_mean_); + diagonal_uvar_new->CopyFromVec(diagonal_uvar_); + + diagonal_mean_new->AddRowSumMat(eta / num_rows, X, 1.0 - eta); + diagonal_uvar_new->AddDiagMat2(eta / num_rows, X, kTrans, 1.0 - eta); +} + +void OnlineNaturalGradient::InitDiagonalParams( + const CuMatrixBase &X) { + int32 dim = X.NumCols(), num_rows = X.NumRows(); + diagonal_mean_.Resize(dim); + diagonal_uvar_.Resize(dim); + diagonal_mean_.AddRowSumMat(1.0 / num_rows, X, 0.0); + diagonal_uvar_.AddDiagMat2(1.0 / num_rows, X, kTrans, 0.0); + UpdateDiagonalScale(); +} + + +void OnlineNaturalGradient::UpdateDiagonalScale() { + KALDI_ASSERT(diagonal_mean_.Dim() != 0); + int32 dim = diagonal_mean_.Dim(); + if (diagonal_scale_.Dim() != dim) + diagonal_scale_.Resize(dim); + diagonal_scale_.CopyFromVec(diagonal_uvar_); + diagonal_scale_.AddVecVec(-1.0, diagonal_mean_, diagonal_mean_, 1.0); + // At this point, diagonal_scale_ is the diagonal of the (centered) variance + // estimated from the x and x2 statistics, prior to any flooring or + // scaling. + BaseFloat avg_variance = diagonal_scale_.Sum() / dim; + if (avg_variance <= 1.0e-20) { + // either the data is all zero or very tiny, or something went wrong. Just + // set diagonal_scale_ to a constant. + diagonal_scale_.Set(1.0); + } else { + BaseFloat floor = diagonal_epsilon_ * avg_variance; + diagonal_scale_.ApplyFloor(floor); + // The following statement scales diagonal_scale_ so its average is close to + // 1, which helps keep things in a reasonable numeric range. There is no + // reason why it has to be exactly one, and the whole thing is mathematically + // invariant to this scaling factor-- we output the scaling factor 'scale' + // from PreconditionDirections() so that the user can rescale so the vector + // 2-norm of the X_t matrix is the same as was before the natural gradient. 
+ diagonal_scale_.Scale(1.0 / avg_variance); + diagonal_scale_.ApplyPow(-0.5 * diagonal_power_); + } } void OnlineNaturalGradient::ReorthogonalizeXt1( @@ -320,9 +433,9 @@ void OnlineNaturalGradient::SelfTest() const { } void OnlineNaturalGradient::PreconditionDirectionsInternal( - const int32 t, const BaseFloat rho_t, const BaseFloat tr_X_Xt, + bool updating, const Vector &d_t, CuMatrixBase *WJKL_t, CuMatrixBase *X_t) { @@ -344,46 +457,9 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( H_t.AddMatMat(1.0, *X_t, kNoTrans, W_t, kTrans, 0.0); // H_t = X_t W_t^T - bool locked = update_mutex_.try_lock(); - if (locked) { - // We'll release the lock if we don't plan to update the parameters. - - // Explanation of the conditions below: - // if (frozen_) because we don't do the update is the user called Freeze(). - // I forget why the (t_ > t) is here; probably some race condition encountered - // a long time ago. Not important; nnet3 doesn't use multiple threads anyway. - // The condition: - // (num_updates_skipped_ < update_period_ - 1 && t_ >= num_initial_updates) - // means we can update if either we're in the first 10 updates (e.g. first - // 10 minibatches), or if we've skipped 'update_period_ - 1' batches of data - // without updating the parameters (this allows us to update only, say, - // every 4 times, for speed, after updating the first 10 times). - - // Just hard-code it here that we do 10 initial updates before skipping any. - const int num_initial_updates = 10; - if (frozen_ || t_ > t || (num_updates_skipped_ < update_period_ - 1 && - t_ >= num_initial_updates)) { - update_mutex_.unlock(); - // We got the lock but we were already beaten to it by another thread, or - // we don't want to update yet due to update_period_ > 1 (this saves - // compute), so release the lock. 
- locked = false; - } - } - - if (!locked) { - // We're not updating the parameters, either because another thread is - // working on updating them, or because another thread already did so from - // the same or later starting point (making our update stale), or because - // update_period_ > 1. We just apply the preconditioning and return. - - // note: we don't bother with any locks before checking frozen_ or incrementing - // num_updates_skipped_ below, because the worst that could happen is that, - // on very rare occasions, we could skip one or two more updates than we - // intended. - if (!frozen_) - num_updates_skipped_++; - + if (!updating) { + // We're not updating the estimate of the Fisher matrix; we just apply the + // preconditioning and return. // X_hat_t = X_t - H_t W_t X_t->AddMatMat(-1.0, H_t, kNoTrans, W_t, kNoTrans, 1.0); return; @@ -481,22 +557,25 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( &L_t); } - // Commit the new parameters. - read_write_mutex_.lock(); - KALDI_ASSERT(t_ == t); // we already ensured this. - t_ = t + 1; - num_updates_skipped_ = 0; W_t_.Swap(&W_t1); d_t_.CopyFromVec(d_t1); rho_t_ = rho_t1; if (self_debug_) SelfTest(); +} + +bool OnlineNaturalGradient::Updating() const { + // Just hard-code it here that we do 10 initial updates before skipping any. + // This must be > 'num_init_iters = 3' from Init(). 
+ const int num_initial_updates = 10; - read_write_mutex_.unlock(); - update_mutex_.unlock(); + return (!frozen_ && + (t_ <= num_initial_updates || + (t_ - num_initial_updates) % update_period_ == 0)); } + BaseFloat OnlineNaturalGradient::Eta(int32 N) const { if (num_minibatches_history_ > 0.0) { KALDI_ASSERT(num_minibatches_history_ > 1.0); @@ -610,12 +689,15 @@ OnlineNaturalGradient::OnlineNaturalGradient(const OnlineNaturalGradient &other) num_samples_history_(other.num_samples_history_), num_minibatches_history_(other.num_minibatches_history_), alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_), - frozen_(other.frozen_), - t_(other.t_), num_updates_skipped_(other.num_updates_skipped_), + frozen_(other.frozen_), t_(other.t_), self_debug_(other.self_debug_), W_t_(other.W_t_), - rho_t_(other.rho_t_), d_t_(other.d_t_) { - // use default constructor for the mutexes. -} + rho_t_(other.rho_t_), d_t_(other.d_t_), + diagonal_power_(other.diagonal_power_), + diagonal_epsilon_(other.diagonal_epsilon_), + diagonal_mean_(other.diagonal_mean_), + diagonal_uvar_(other.diagonal_uvar_), + diagonal_scale_(other.diagonal_scale_) { } + OnlineNaturalGradient& OnlineNaturalGradient::operator = ( const OnlineNaturalGradient &other) { diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index 0c43c7f5c46..f2713063492 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -411,6 +411,43 @@ namespace nnet3 { is that this isn't going to be a problem. */ +/** + DIAGONAL_EXTENSION + + This comment explains the diagonal extension to the natural gradient method (this + was not described in the original paper). + + Physically this diagonal scaling happens both before and after the main natural + gradient code. I.e. 
the main natural gradient code (which makes use + of a scaled-unit-plus-low-rank factorization), happens inside the + space where we've applied the diagonal component of the preconditioning, + so the overall natural-gradient matrix is of the form: + diag scaled-unit-plus-low-rank diag. + The way this is estimated only really makes sense if diagonal_power_ + is either zero or one, but I expect that for in-between values it will + work fine in practice. + + The way the diagonal scaling factor is estimated is that we accumulate mean + and variance stats for each dimension (decaying over time like the previous + natural gradient stats), and set the scaling factor to some power of the + variance estimated this way. The power of the variance used to get the + scaling factor is actually -0.5 times diagonal_power_, the factor of 0.5 + being required because the scaling is applied twice, both before and after + the scaled-unit-plus-low-rank inverse-Fisher matrix, to preserve symmetry. + + It may seem odd that we are taking into account the mean here, while + conceptually it's the uncentered covariance of the vectors that we're + modeling. The reason is that any offset in the vectors we're modeling + can be taken into account by one of the eigenvectors of the low-rank + matrix, so we anticipate that taking the mean out of consideration will + tend to give us a better factorization. This is all a little bit ad-hoc. + It would be cleaner to formulate this whole thing as learning a factored + representation of the inverse Fisher matrix, but that would become + very complicated, so we just estimate the diagonal in this rather ad-hoc + way and then do the low-rank factorization of the Fisher matrix after + the diagonal preconditioning. 
+ */ + class OnlineNaturalGradient { public: @@ -434,6 +471,11 @@ class OnlineNaturalGradient { int32 GetRank() const { return rank_; } int32 GetUpdatePeriod() const { return update_period_; } + // search above for DIAGONAL_EXTENSION for explanations. Value should + // be between 0 and 1. + void SetDiagonalPower(BaseFloat p) { diagonal_power_ = p; } + BaseFloat GetDiagonalPower() const { return diagonal_power_; } + // see comment where 'frozen_' is declared. inline void Freeze(bool frozen) { frozen_ = frozen; } @@ -470,16 +512,30 @@ class OnlineNaturalGradient { private: - // This is an internal function called from PreconditionDirections(), - // which handles some multithreading issues and then calls this function. + // This is an internal function called from PreconditionDirections(). // Note: WJKL_t (dimension 2*R by D + R) is [ W_t L_t; J_t K_t ]. - void PreconditionDirectionsInternal(const int32 t, - const BaseFloat rho_t, + void PreconditionDirectionsInternal(const BaseFloat rho_t, const BaseFloat tr_X_Xt, + bool updating, + const Vector &d_t, + CuMatrixBase *WJKL_t, + CuMatrixBase *X_t); + + // This function is called from PreconditionDirections(), only if + // diagonal_power_ != 0.0 (see comment starting DIAGONAL_EXTENSION above). + // It takes care of the diagonal factors in the Fisher-matrix estimate + // and recurses to PreconditionDirectionsInternal(). + void PreconditionDirectionsDiagonal(const BaseFloat rho_t, + bool updating, const Vector &d_t, CuMatrixBase *WJKL_t, CuMatrixBase *X_t); + + // Works out from t_ and various class variables whether we will update + // the parameters on this iteration (returns true if so). + bool Updating() const; + void ComputeEt(const VectorBase &d_t, BaseFloat beta_t, VectorBase *e_t, @@ -541,6 +597,29 @@ class OnlineNaturalGradient { // properties. 
void SelfTest() const; + + // This function, called only if diagonal_power_ != 0.0 (see + // DIAGONAL_EXTENSION comment), initializes diagonal_mean_, diagonal_uvar_ and + // diagonal_scale_, with stats from this minibatch (X is the vectors before + // preconditioning, one vector per row). + void InitDiagonalParams(const CuMatrixBase &X); + + // This function, called only if diagonal_power_ != 0.0 (see + // DIAGONAL_EXTENSION comment), sets diagonal_mean_new and diagonal_uvar_new to + // updated versions of the diagonal stats in diagonal_mean_ and diagonal_uvar_: + // changed by scaling down the old stats and then adding in stats from 'X'. + // 'X' is the vectors (one per row) that are doing to multiplied by our + // natural gradient matrix. The provided pointers will be pointers to + // temporaries that will later be copied to class members. + void UpdateDiagonalStats(const CuMatrixBase &X, + CuVectorBase *diagonal_mean_new, + CuVectorBase *diagonal_uvar_new); + + // This function updates diagonal_scale_ from the stats in + // diagonal_mean_ and diagonal_uvar_. + void UpdateDiagonalScale(); + + // Configuration values: // The rank of the correction to the unit matrix (e.g. 20). @@ -596,15 +675,10 @@ class OnlineNaturalGradient { // the *second* time we see the same data (to avoid biasing the update). bool frozen_; - // t is a counter that measures how many updates we've done. + // t is a counter that measures how many times the user has previously called + // PreconditionDirections(); it's 0 if that has never been called. int32 t_; - // This keeps track of how many minibatches we've skipped updating the parameters, - // since the most recent update; it's used in enforcing "update_period_", which - // is a mechanism to avoid spending too much time updating the subspace (which can - // be wasteful). - int32 num_updates_skipped_; - // If true, activates certain checks. 
bool self_debug_; @@ -612,12 +686,41 @@ class OnlineNaturalGradient { BaseFloat rho_t_; Vector d_t_; - // Used to prevent parameters being read or written in an inconsistent state. - std::mutex read_write_mutex_; + // Things below this point relate to 'diagonal' preconditioning. + // Search above for DIAGONAL_EXTENSION for an in-depth explanation. + + // The diagonal extension is turned off by default (diagonal_power_ == 0.0), + // but you can turn it on by setting diagonal_power_ (probably to some + // positive value not greater than 1, with 1 corresponding to natural + // gradient, and 0.5 corresponding to something more like Adagrad). + BaseFloat diagonal_power_; + + // diagonal_epsilon_ (e.g. 0.001) is a floor on the diagonal elements of the + // variances; this is expressed relative to the average un-floored variance + // over all dimensions (since dynamic ranges differ considerably). + BaseFloat diagonal_epsilon_; + + // dim_ is not a real variable but it is useful for explaining some things + // we're doing below. It's the dimension of the vectors we're preconditioning: + // D in the math and the paper. Is is the same as W_t_.NumCols(). + // int32 dim_; + + // diagonal_mean_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a + // moving-average mean of the vectors we're preconditioning. + CuVector diagonal_mean_; + + // diagonal_xuvar_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a + // decaying average over minibatches of the (diagonal) uncentered variance of + // the input vectors we're preconditioning. + CuVector diagonal_uvar_; + + // diagonal_scale_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a + // vector of scaling factors which is the diagonal part of the inverse-Fisher + // matrix, applied before and after the scaled-unit-plus-low-rank part. + // It is the (floored and rescaled) variance estimated from the stats in + // diagonal_mean_ and diagonal_uvar_, taken to the power -0.5 * diagonal_power_. 
+ CuVector diagonal_scale_; - // This mutex is used to control which thread gets to update the - // parameters, in multi-threaded code. - std::mutex update_mutex_; }; diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index f2019849117..da14c188244 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2649,16 +2649,37 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { linear_params_.Read(is, binary); ExpectToken(is, binary, ""); bias_params_.Read(is, binary); + + BaseFloat num_samples_history, alpha; + int32 rank_in, rank_out, update_period; + ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &rank_in_); + ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &rank_out_); + ReadBasicType(is, binary, &rank_out); + if (PeekToken(is, binary) == 'D') { + ExpectToken(is, binary, ""); + BaseFloat d_in, d_out; + ReadBasicType(is, binary, &d_in); + ReadBasicType(is, binary, &d_out); + preconditioner_in_.SetDiagonalPower(d_in); + preconditioner_out_.SetDiagonalPower(d_out); + } ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &update_period_); + ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &num_samples_history_); + ReadBasicType(is, binary, &num_samples_history); ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &alpha_); + ReadBasicType(is, binary, &alpha); + + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + if (PeekToken(is, binary) == 'M') { // MaxChangePerSample, long ago removed; back compatibility. 
ExpectToken(is, binary, ""); @@ -2687,7 +2708,6 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { token != "") KALDI_ERR << "Expected or " << ", got " << token; - SetNaturalGradientConfigs(); } @@ -2697,30 +2717,21 @@ NaturalGradientAffineComponent::NaturalGradientAffineComponent( AffineComponent(linear_params, bias_params, 0.001) { KALDI_ASSERT(bias_params.Dim() == linear_params.NumRows() && bias_params.Dim() != 0); - num_samples_history_ = 2000.0; - alpha_ = 4.0; - rank_in_ = 20; - rank_out_ = 80; - update_period_ = 4; - SetNaturalGradientConfigs(); + + // set some default natural gradient configs. + preconditioner_in_.SetRank(20); + preconditioner_out_.SetRank(80); + preconditioner_in_.SetUpdatePeriod(4); + preconditioner_out_.SetUpdatePeriod(4); } void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bool ok = true; std::string matrix_filename; - num_samples_history_ = 2000.0; - alpha_ = 4.0; - rank_in_ = 20; - rank_out_ = 80; - update_period_ = 4; + is_gradient_ = false; // not configurable; there's no reason you'd want this InitLearningRatesFromConfig(cfl); - cfl->GetValue("num-samples-history", &num_samples_history_); - cfl->GetValue("alpha", &alpha_); - cfl->GetValue("rank-in", &rank_in_); - cfl->GetValue("rank-out", &rank_out_); - cfl->GetValue("update-period", &update_period_); if (cfl->GetValue("matrix", &matrix_filename)) { CuMatrix mat; @@ -2759,23 +2770,37 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bias_params_.Scale(bias_stddev); bias_params_.Add(bias_mean); } + + // Set natural-gradient configs. 
+ BaseFloat num_samples_history = 2000.0, + alpha = 4.0, + diagonal_power_in = 0.0, + diagonal_power_out = 0.0; + int32 rank_in = 20, rank_out = 80, + update_period = 4; + cfl->GetValue("num-samples-history", &num_samples_history); + cfl->GetValue("alpha", &alpha); + cfl->GetValue("rank-in", &rank_in); + cfl->GetValue("rank-out", &rank_out); + cfl->GetValue("update-period", &update_period); + cfl->GetValue("diagonal-power-in", &diagonal_power_in); + cfl->GetValue("diagonal-power-out", &diagonal_power_out); + + preconditioner_in_.SetNumSamplesHistory(num_samples_history); + preconditioner_out_.SetNumSamplesHistory(num_samples_history); + preconditioner_in_.SetAlpha(alpha); + preconditioner_out_.SetAlpha(alpha); + preconditioner_in_.SetRank(rank_in); + preconditioner_out_.SetRank(rank_out); + preconditioner_out_.SetUpdatePeriod(update_period); + preconditioner_in_.SetDiagonalPower(diagonal_power_in); + preconditioner_out_.SetDiagonalPower(diagonal_power_out); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); if (!ok) KALDI_ERR << "Bad initializer " << cfl->WholeLine(); - SetNaturalGradientConfigs(); -} - -void NaturalGradientAffineComponent::SetNaturalGradientConfigs() { - preconditioner_in_.SetRank(rank_in_); - preconditioner_in_.SetNumSamplesHistory(num_samples_history_); - preconditioner_in_.SetAlpha(alpha_); - preconditioner_in_.SetUpdatePeriod(update_period_); - preconditioner_out_.SetRank(rank_out_); - preconditioner_out_.SetNumSamplesHistory(num_samples_history_); - preconditioner_out_.SetAlpha(alpha_); - preconditioner_out_.SetUpdatePeriod(update_period_); } void NaturalGradientAffineComponent::Write(std::ostream &os, @@ -2786,26 +2811,39 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, WriteToken(os, binary, ""); bias_params_.Write(os, binary); WriteToken(os, binary, ""); - WriteBasicType(os, binary, rank_in_); + WriteBasicType(os, binary, preconditioner_in_.GetRank()); 
WriteToken(os, binary, ""); - WriteBasicType(os, binary, rank_out_); + WriteBasicType(os, binary, preconditioner_out_.GetRank()); + BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), + d_out = preconditioner_out_.GetDiagonalPower(); + if (d_in != 0.0 || d_out != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, d_in); + WriteBasicType(os, binary, d_out); + } WriteToken(os, binary, ""); - WriteBasicType(os, binary, update_period_); + WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, num_samples_history_); + WriteBasicType(os, binary, preconditioner_in_.GetNumSamplesHistory()); WriteToken(os, binary, ""); - WriteBasicType(os, binary, alpha_); + WriteBasicType(os, binary, preconditioner_in_.GetAlpha()); WriteToken(os, binary, ""); } std::string NaturalGradientAffineComponent::Info() const { std::ostringstream stream; stream << AffineComponent::Info(); - stream << ", rank-in=" << rank_in_ - << ", rank-out=" << rank_out_ - << ", num-samples-history=" << num_samples_history_ - << ", update-period=" << update_period_ - << ", alpha=" << alpha_; + stream << ", rank-in=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() + << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() + << ", update-period=" << preconditioner_in_.GetUpdatePeriod() + << ", alpha=" << preconditioner_in_.GetAlpha(); + BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), + d_out = preconditioner_out_.GetDiagonalPower(); + if (d_in != 0.0 || d_out != 0.0) { + stream << ", diagonal-power-in=" << d_in + << ", diagonal-power-out=" << d_out; + } return stream.str(); } @@ -2816,15 +2854,8 @@ Component* NaturalGradientAffineComponent::Copy() const { NaturalGradientAffineComponent::NaturalGradientAffineComponent( const NaturalGradientAffineComponent &other): AffineComponent(other), - rank_in_(other.rank_in_), - rank_out_(other.rank_out_), - 
update_period_(other.update_period_), - num_samples_history_(other.num_samples_history_), - alpha_(other.alpha_), preconditioner_in_(other.preconditioner_in_), - preconditioner_out_(other.preconditioner_out_) { - SetNaturalGradientConfigs(); -} + preconditioner_out_(other.preconditioner_out_) { } void NaturalGradientAffineComponent::Update( const std::string &debug_info, @@ -2917,6 +2948,14 @@ void LinearComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_in); ReadBasicType(is, binary, &rank_out); + if (PeekToken(is, binary) == 'D') { + ExpectToken(is, binary, ""); + BaseFloat d_in, d_out; + ReadBasicType(is, binary, &d_in); + ReadBasicType(is, binary, &d_out); + preconditioner_in_.SetDiagonalPower(d_in); + preconditioner_out_.SetDiagonalPower(d_out); + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &alpha); ExpectToken(is, binary, ""); @@ -2968,7 +3007,10 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { // Read various natural-gradient-related configs. 
int32 rank_in = 20, rank_out = 80, update_period = 4; BaseFloat alpha = 4.0, - num_samples_history = 2000.0; + num_samples_history = 2000.0, + diagonal_power_in = 0.0, + diagonal_power_out = 0.0; + use_natural_gradient_ = true; cfl->GetValue("num-samples-history", &num_samples_history); @@ -2977,6 +3019,9 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); cfl->GetValue("use-natural-gradient", &use_natural_gradient_); + cfl->GetValue("diagonal-power-in", &diagonal_power_in); + cfl->GetValue("diagonal-power-out", &diagonal_power_out); + preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); @@ -2986,7 +3031,8 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); - + preconditioner_in_.SetDiagonalPower(diagonal_power_in); + preconditioner_out_.SetDiagonalPower(diagonal_power_out); orthonormal_constraint_ = 0.0; cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); @@ -3013,10 +3059,17 @@ void LinearComponent::Write(std::ostream &os, rank_out = preconditioner_out_.GetRank(), update_period = preconditioner_in_.GetUpdatePeriod(); BaseFloat alpha = preconditioner_in_.GetAlpha(), - num_samples_history = preconditioner_in_.GetNumSamplesHistory(); + num_samples_history = preconditioner_in_.GetNumSamplesHistory(), + d_in = preconditioner_in_.GetDiagonalPower(), + d_out = preconditioner_out_.GetDiagonalPower(); WriteToken(os, binary, ""); WriteBasicType(os, binary, rank_in); WriteBasicType(os, binary, rank_out); + if (d_in != 0.0 || d_out != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, d_in); + WriteBasicType(os, binary, d_out); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, alpha); WriteToken(os, binary, ""); @@ -3036,6 +3089,12 @@ std::string 
LinearComponent::Info() const { GetVerboseLevel() >= 2); // include_singular_values if (orthonormal_constraint_ != 0.0) stream << ", orthonormal-constraint=" << orthonormal_constraint_; + BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), + d_out = preconditioner_out_.GetDiagonalPower(); + if (d_in != 0.0 || d_out != 0.0) { + stream << ", diagonal-power-in=" << d_in + << ", diagonal-power-out=" << d_out; + } stream << ", use-natural-gradient=" << (use_natural_gradient_ ? "true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index ef2fbd988a5..2432c912e75 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -773,6 +773,13 @@ class LogSoftmaxComponent: public NonlinearComponent { minibatches) we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. + diagonal-power-in, diagonal-power-out + Control a diagonal factor in the natural gradient + factorization, for the input and output spaces + respectively 0.0 = default (old-style natural + gradient), 1.0 = natural gradient with the diagonal + factors; 0.5 is more like a factorized type of + adagrad. */ class NaturalGradientAffineComponent: public AffineComponent { public: @@ -805,15 +812,14 @@ class NaturalGradientAffineComponent: public AffineComponent { int32 update_period_; BaseFloat num_samples_history_; BaseFloat alpha_; + // note: the config values diagonal-power-in and diagonal-power-out + // are stored in the objects preconditioner_in_ and preconditioner_out_ + // directly. OnlineNaturalGradient preconditioner_in_; OnlineNaturalGradient preconditioner_out_; - // Sets the configs rank, alpha and eta in the preconditioner objects, - // from the class variables. 
- void SetNaturalGradientConfigs(); - virtual void Update( const std::string &debug_info, const CuMatrixBase &in_value, @@ -877,6 +883,13 @@ class NaturalGradientAffineComponent: public AffineComponent { minibatches) we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. + diagonal-power-in, diagonal-power-out + Control a diagonal factor in the natural gradient + factorization, for the input and output spaces + respectively 0.0 = default (old-style natural + gradient), 1.0 = natural gradient with the diagonal + factors; 0.5 is more like a factorized type of + adagrad. */ class LinearComponent: public UpdatableComponent { public: From 6495571fef901a3013c625d2d8b6273fd31d195c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 23 Dec 2017 03:02:01 -0500 Subject: [PATCH 033/184] Revert "[scripts] Fixing bug in fast-lstm-layer and lstmb-layer whereby c was used instead of m for affine transform." This reverts commit 69d0d380b5e14771f0284429d8626f02e6b3f6a1. --- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 27 +++++++++++---------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 3743413ab34..5827ea4d179 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -737,13 +737,17 @@ def _generate_lstm_config(self): configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, l2_regularize_option)) + # Note from Dan: I don't remember why we are applying the backprop + # truncation on both c and m appended together, instead of just on c. + # Possibly there was some memory or speed or WER reason for it which I + # have forgotten about now. 
configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " "{2}".format(name, 2 * cell_dim, bptrunc_str)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " - "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( + "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( name, input_descriptor, self.config['recurrence-scale'], delay)) if self.config['self-stabilize']: configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) @@ -759,7 +763,6 @@ def _generate_lstm_config(self): configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) if self.layer_type == "fast-lstm-batchnorm-layer": # Add the batchnorm component, if requested to include batchnorm. 
@@ -911,7 +914,7 @@ def _generate_lstm_config(self): name, input_dim + cell_dim, bottleneck_dim, affine_str)) - configs.append("component name={0}.m_trunc_memnorm type=MemoryNormComponent dim={1} ".format( + configs.append("component name={0}.c_trunc_memnorm type=MemoryNormComponent dim={1} ".format( name, cell_dim)) configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " @@ -929,15 +932,15 @@ def _generate_lstm_config(self): l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( - name, 2 * cell_dim, bptrunc_str)) + configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, cell_dim, bptrunc_str)) configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( name, cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset(Scale(1.0, {0}.m_trunc_memnorm), {2})))".format( + "IfDefined(Offset({0}.c_trunc_memnorm, {2})))".format( name, input_descriptor, delay)) configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) @@ -947,15 +950,13 @@ def _generate_lstm_config(self): configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( name, delay)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " - "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 " 
"dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.m_trunc_memnorm component={0}.m_trunc_memnorm " - "input={0}.m_trunc".format(name)) + configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) + configs.append("component-node name={0}.c_trunc_memnorm component={0}.c_trunc_memnorm " + "input={0}.c_trunc".format(name)) configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) From f4c5e3d16a0920ee4f9b03917f38b69b374273b7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 23 Dec 2017 16:29:24 -0500 Subject: [PATCH 034/184] [scripts] Add bottleneck-dim to xconfig basic layers and output layers. --- .../steps/libs/nnet3/xconfig/basic_layers.py | 194 +++++++++++------- 1 file changed, 120 insertions(+), 74 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index c8a71e15672..e62a090c25e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -402,8 +402,7 @@ def get_full_config(self): # the input layers need to be printed in 'init.config' (which # initializes the neural network prior to the LDA), in 'ref.config', # which is a version of the config file used for getting left and right - # context (it doesn't read anything for the LDA-like transform and/or - # presoftmax-prior-scale components) + # context (it doesn't read anything for the LDA-like transform). # In 'full.config' we write everything, this is just for reference, # and also for cases where we don't use the LDA-like transform. 
ans = [] @@ -430,6 +429,9 @@ class XconfigOutputLayer(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' : Descriptor giving the input of the layer. dim=None : Output dimension of layer, will normally equal the number of pdfs. + bottleneck-dim=None : Bottleneck dimension of layer: if supplied, instead of + an affine component we'll have a linear then affine, so a linear + bottleneck, with the linear part constrained to be orthonormal. include-log-softmax=true : setting it to false will omit the log-softmax component- useful for chain models. objective-type=linear : the only other choice currently is @@ -441,16 +443,6 @@ class XconfigOutputLayer(XconfigLayerBase): learning-rate-factor=(0.5/xent_regularize), normally learning-rate-factor=5.0 since xent_regularize is normally 0.1. - presoftmax-scale-file=None : If set, a filename for a vector that - will be used to scale the output of the affine component before the - log-softmax (if include-log-softmax=true), or before the output - (if not). This is helpful to avoid instability in training due to - some classes having much more data than others. The way we normally - create this vector is to take the priors of the classes to the - power -0.25 and rescale them so the average is 1.0. This factor - -0.25 is referred to as presoftmax_prior_scale_power in scripts. In - the scripts this would normally be set to - config_dir/presoftmax_prior_scale.vec max-change=1.5 : Can be used to change the max-change parameter in the affine component; this affects how much the matrix can change on each iteration. @@ -462,6 +454,9 @@ class XconfigOutputLayer(XconfigLayerBase): ng-affine-options='' : Can be used supply non-default options to the affine layer (intended for the natural gradient but can be an arbitrary string to be added to the config line. e.g. 'update-period=2'.). + ng-linear-options='' : Options, like ng-affine-options, that are passed to + the LinearComponent, only in bottleneck layers (i.e. 
if bottleneck-dim + is supplied). """ def __init__(self, first_token, key_to_value, prev_names=None): @@ -475,13 +470,13 @@ def set_default_configs(self): # the most recent layer. self.config = {'input': '[-1]', 'dim': -1, + 'bottleneck-dim': -1, 'include-log-softmax': True, # this would be false for chain models 'objective-type': 'linear', # see Nnet::ProcessOutputNodeConfigLine in # nnet-nnet.cc for other options 'learning-rate-factor': 1.0, - 'presoftmax-scale-file': '', # used in DNN (not RNN) training when using # frame-level objfns, 'max-change': 1.5, @@ -489,7 +484,8 @@ def set_default_configs(self): 'bias-stddev': 0.0, 'l2-regularize': 0.0, 'output-delay': 0, - 'ng-affine-options': '' + 'ng-affine-options': '', + 'ng-linear-options': '' # only affects bottleneck output layers. } def check_configs(self): @@ -533,8 +529,20 @@ def output_dim(self, auxiliary_output=None): " layers") def get_full_config(self): - ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in LSTM initialization + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + + def _generate_config(self): + + configs = [] # note: each value of self.descriptors is (descriptor, dim, # normalized-string, output-string). 
@@ -543,10 +551,10 @@ def get_full_config(self): descriptor_final_string = self.descriptors['input']['final-string'] input_dim = self.descriptors['input']['dim'] output_dim = self.config['dim'] + bottleneck_dim = self.config['bottleneck-dim'] objective_type = self.config['objective-type'] learning_rate_factor = self.config['learning-rate-factor'] include_log_softmax = self.config['include-log-softmax'] - presoftmax_scale_file = self.config['presoftmax-scale-file'] param_stddev = self.config['param-stddev'] bias_stddev = self.config['bias-stddev'] l2_regularize = self.config['l2-regularize'] @@ -558,64 +566,70 @@ def get_full_config(self): l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) if l2_regularize != 0.0 else '') - # note: ref.config is used only for getting the left-context and - # right-context of the network; - # final.config is where we put the actual network definition. - for config_name in ['ref', 'final']: - # First the affine node. - line = ('component name={0}.affine' - ' type=NaturalGradientAffineComponent' - ' input-dim={1}' - ' output-dim={2}' - ' param-stddev={3}' - ' bias-stddev={4}' - ' max-change={5} {6} {7} {8}' - ''.format(self.name, input_dim, output_dim, - param_stddev, bias_stddev, max_change, ng_affine_options, - learning_rate_option, l2_regularize_option)) - ans.append((config_name, line)) - - line = ('component-node name={0}.affine' - ' component={0}.affine input={1}' - ''.format(self.name, descriptor_final_string)) - ans.append((config_name, line)) - cur_node = '{0}.affine'.format(self.name) - - if presoftmax_scale_file is not '' and config_name == 'final': - # don't use the presoftmax-scale in 'ref.config' since that - # file won't exist at the time we evaluate it. - # (ref.config is used to find the left/right context). 
- line = ('component name={0}.fixed-scale' - ' type=FixedScaleComponent scales={1}' - ''.format(self.name, presoftmax_scale_file)) - ans.append((config_name, line)) - - line = ('component-node name={0}.fixed-scale' - ' component={0}.fixed-scale input={1}' - ''.format(self.name, cur_node)) - ans.append((config_name, line)) - cur_node = '{0}.fixed-scale'.format(self.name) + cur_node = descriptor_final_string + cur_dim = input_dim + + if bottleneck_dim >= 0: + if bottleneck_dim == 0 or bottleneck_dim >= input_dim or bottleneck_dim >= output_dim: + raise RuntimeError("Bottleneck dim has value that does not make sense: {0}".format( + bottleneck_dim)) + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. + linear_options = self.config['ng-linear-options'] + + # note: by default the LinearComponent uses natural gradient. 
+ line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} ' + 'max-change=0.75 {3}' + ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + + line = ('component name={0}.affine' + ' type=NaturalGradientAffineComponent' + ' input-dim={1}' + ' output-dim={2}' + ' param-stddev={3}' + ' bias-stddev={4}' + ' max-change={5} {6} {7} {8}' + ''.format(self.name, cur_dim, output_dim, + param_stddev, bias_stddev, max_change, ng_affine_options, + learning_rate_option, l2_regularize_option)) + configs.append(line) + line = ('component-node name={0}.affine' + ' component={0}.affine input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.affine'.format(self.name) - if include_log_softmax: - line = ('component name={0}.log-softmax' - ' type=LogSoftmaxComponent dim={1}' - ''.format(self.name, output_dim)) - ans.append((config_name, line)) + if include_log_softmax: + line = ('component name={0}.log-softmax' + ' type=LogSoftmaxComponent dim={1}' + ''.format(self.name, output_dim)) + configs.append(line) - line = ('component-node name={0}.log-softmax' - ' component={0}.log-softmax input={1}' - ''.format(self.name, cur_node)) - ans.append((config_name, line)) - cur_node = '{0}.log-softmax'.format(self.name) + line = ('component-node name={0}.log-softmax' + ' component={0}.log-softmax input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.log-softmax'.format(self.name) - if output_delay != 0: - cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + if output_delay != 0: + cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) - line = ('output-node name={0} input={1} ' - 'objective={2}'.format( - self.name, cur_node, 
objective_type)) - ans.append((config_name, line)) - return ans + line = ('output-node name={0} input={1} ' + 'objective={2}'.format( + self.name, cur_node, objective_type)) + configs.append(line) + return configs class XconfigBasicLayer(XconfigLayerBase): @@ -657,9 +671,11 @@ def set_default_configs(self): # the most recent layer. self.config = {'input': '[-1]', 'dim': -1, + 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, 'ng-affine-options': '', + 'ng-linear-options': '', # only affects bottleneck layers. 'dropout-proportion': 0.5, # dropout-proportion only # affects layers with # 'dropout' in the name. @@ -674,6 +690,10 @@ def set_default_configs(self): def check_configs(self): if self.config['dim'] < 0: raise RuntimeError("dim has invalid value {0}".format(self.config['dim'])) + b = self.config['bottleneck-dim'] + if b >= 0 and (b >= self.config['dim'] or b == 0): + raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: raise RuntimeError("self-repair-scale has invalid value {0}" .format(self.config['self-repair-scale'])) @@ -751,14 +771,40 @@ def _add_components(self, input_desc, input_dim, nonlinearities): "there is a final 'renorm' component.") configs = [] - # First the affine node. + cur_dim = input_dim + cur_node = input_desc + + # First the affine node (or linear then affine, if bottleneck). + if self.config['bottleneck-dim'] > 0: + # This is the bottleneck case (it doesn't necessarily imply we + # will be using the features from the bottleneck; it's just a factorization + # of the matrix into two pieces without a nonlinearity in between). + # We don't include the l2-regularize option because it's useless + # given the orthonormality constraint. 
+ linear_options = self.config['ng-linear-options'] + for opt_name in [ 'max-change', 'learning-rate-factor' ]: + value = self.config[opt_name] + if value != '': + linear_options += ' {0}={1}'.format(opt_name, value) + bottleneck_dim = self.config['bottleneck-dim'] + # note: by default the LinearComponent uses natural gradient. + line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} {3}' + ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.linear'.format(self.name) + cur_dim = bottleneck_dim + + line = ('component name={0}.affine type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, input_dim, output_dim, affine_options)) + ''.format(self.name, cur_dim, output_dim, affine_options)) configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={1}' - ''.format(self.name, input_desc)) + ''.format(self.name, cur_node)) configs.append(line) cur_node = '{0}.affine'.format(self.name) From 421a062477d732fc02e2109b9d50857ae0f18661 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 27 Dec 2017 00:03:33 -0500 Subject: [PATCH 035/184] [src] Some bug fixes; change to natural-gradient-online RE last dim --- egs/mini_librispeech/s5/local/chain/compare_wer.sh | 6 ++++++ src/nnet3/natural-gradient-online.cc | 12 +++++++++++- src/rnnlm/rnnlm-embedding-training.cc | 10 ++++------ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chain/compare_wer.sh b/egs/mini_librispeech/s5/local/chain/compare_wer.sh index cd6be14ed88..8ee5db2326a 100755 --- a/egs/mini_librispeech/s5/local/chain/compare_wer.sh +++ b/egs/mini_librispeech/s5/local/chain/compare_wer.sh @@ -129,3 +129,9 @@ for x in $*; do printf "% 10s" $prob done echo + +echo 
-n "# Num-params " +for x in $*; do + printf "% 10s" $(grep num-parameters $x/log/progress.1.log | awk '{print $2}') +done +echo diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 4c4d5a1b888..83702626f5f 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -18,6 +18,7 @@ // limitations under the License. #include "nnet3/natural-gradient-online.h" +#include "nnet3/nnet-parse.h" namespace kaldi { namespace nnet3 { @@ -286,7 +287,16 @@ void OnlineNaturalGradient::UpdateDiagonalScale() { if (diagonal_scale_.Dim() != dim) diagonal_scale_.Resize(dim); diagonal_scale_.CopyFromVec(diagonal_uvar_); - diagonal_scale_.AddVecVec(-1.0, diagonal_mean_, diagonal_mean_, 1.0); + // Because the last element may be the offset, and it doesn't + // make sense to subtract its mean for this purpose, only do this for + // the all-but-last-elements. + if (dim > 1) { + CuSubVector diagonal_scale_part(diagonal_scale_, 0, dim - 1), + diagonal_mean_part(diagonal_mean_, 0, dim - 1); + diagonal_scale_part.AddVecVec(-1.0, diagonal_mean_part, + diagonal_mean_part, 1.0); + } + // At this point, diagonal_scale_ is the diagonal of the (centered) variance // estimated from the x and x2 statistics, prior to any flooring or // scaling. 
diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc index 00d939da5be..47b347047fb 100644 --- a/src/rnnlm/rnnlm-embedding-training.cc +++ b/src/rnnlm/rnnlm-embedding-training.cc @@ -77,12 +77,11 @@ void RnnlmEmbeddingTrainer::Train( if (l2_term != 0.0) { embedding_deriv->AddMat(l2_term, *embedding_mat_); } - } + } BaseFloat scale = 1.0; if (config_.use_natural_gradient) { - preconditioner_.PreconditionDirections(embedding_deriv, NULL, - &scale); + preconditioner_.PreconditionDirections(embedding_deriv, &scale); } scale *= config_.learning_rate; num_minibatches_++; @@ -130,11 +129,10 @@ void RnnlmEmbeddingTrainer::Train( if (l2_term != 0.0) { embedding_deriv->AddMat(l2_term, *embedding_mat_); } - } + } BaseFloat scale = 1.0; if (config_.use_natural_gradient) { - preconditioner_.PreconditionDirections(embedding_deriv, NULL, - &scale); + preconditioner_.PreconditionDirections(embedding_deriv, &scale); } scale *= config_.learning_rate; num_minibatches_++; From e71ddae4fca84f8f10efb30e578d50030de5259c Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 27 Dec 2017 19:24:36 -0800 Subject: [PATCH 036/184] [src] Reorganize batch-norm code and add power option. --- src/nnet3/nnet-normalize-component.cc | 233 ++++++++++++++------------ src/nnet3/nnet-normalize-component.h | 15 +- src/nnet3/nnet-test-utils.cc | 1 + 3 files changed, 135 insertions(+), 114 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 1e3314bf91f..3f105cd8e2b 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -234,8 +234,8 @@ void BatchNormComponent::ComputeDerived() { // of numerical roundoff. scale_.ApplyFloor(0.0); scale_.Add(epsilon_); - scale_.ApplyPow(-0.5); - // now scale_ = min(variance, epsilon)^{-0.5}. + scale_.ApplyPow(power_); + // now scale_ = min(variance, epsilon)^power_ // next, multiply by the target RMS (normally 1.0). 
scale_.Scale(target_rms_); offset_.MulElements(scale_); @@ -253,10 +253,10 @@ void BatchNormComponent::Check() const { } BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), - target_rms_(other.target_rms_), test_mode_(other.test_mode_), - count_(other.count_), stats_sum_(other.stats_sum_), - stats_sumsq_(other.stats_sumsq_) { + dim_(other.dim_), block_dim_(other.block_dim_), power_(other.power_), + epsilon_(other.epsilon_), target_rms_(other.target_rms_), + test_mode_(other.test_mode_), count_(other.count_), + stats_sum_(other.stats_sum_), stats_sumsq_(other.stats_sumsq_) { ComputeDerived(); Check(); } @@ -267,6 +267,7 @@ std::string BatchNormComponent::Info() const { stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ << ", count=" << count_ + << ", power=" << power_ << ", test-mode=" << (test_mode_ ? "true" : "false"); if (count_ > 0) { Vector mean(stats_sum_), var(stats_sumsq_); @@ -285,12 +286,14 @@ std::string BatchNormComponent::Info() const { void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { dim_ = -1; block_dim_ = -1; + power_ = -0.5; epsilon_ = 1.0e-03; target_rms_ = 1.0; test_mode_ = false; bool ok = cfl->GetValue("dim", &dim_); cfl->GetValue("block-dim", &block_dim_); cfl->GetValue("epsilon", &epsilon_); + cfl->GetValue("power", &power_); cfl->GetValue("target-rms", &target_rms_); cfl->GetValue("test-mode", &test_mode_); if (!ok || dim_ <= 0) { @@ -304,6 +307,8 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); + if (power_ >= 0 || power_ <= -1.0) + KALDI_ERR << "Power has invalid value " << power_; count_ = 0; stats_sum_.Resize(block_dim_); stats_sumsq_.Resize(block_dim_); @@ -325,95 +330,72 @@ void BatchNormComponent::InitFromConfig(ConfigLine 
*cfl) { FORWARD PASS: - Define xsum = sum_i x(i) - x2sum = sum_i x(i)^2 - mean = xsum / n - var = x2sum / n - (mean*mean) - scale = sqrt(var + epsilon)^{-0.5} - offset = -mean * scale + Let 'power' be a constant, equal to -0.5 for regular batch-norm. + + To simplify the math we (conceptually, not physically) do the normalization in + two stages: first mean, then variance, so we have x(i) -> y(i) -> z(i). + + The name 'rscale' means 'raw scale', meaning the scale before including + target-rms. Later we'll define 'scale = target-rms * rscale', to make some + of the actual computations slightly more efficient. + + Define: mean = 1/I * sum_i x(i) + y(i) = x(i) - mean - y(i) = scale * x(i) + offset + var = 1/I \sum_i y(i)^2 + rscale = sqrt(var + epsilon)^power <---- For regular batchnorm, power == -0.5. + z(i) = target-rms * rscale * y(i) - Most of the rest of this comment derives how to compute the derivatives. If - you just want the formulas, please skip to the string 'BACKWARD PASS' below. + + Most of the rest of this comment derives how to compute the derivatives. If + you just want the formulas, please skip to the string 'BACKWARD PASS' below. We'll use a notation where an apostrophe on something means (the derivative of the objective function w.r.t. that thing), so y'(i) is df/dy(i), and so on. We are given y'(i). Propagating the derivatives backward: - offset' = sum_i y'(i) - scale' = (sum_i y'(i) * x(i)) - offset' * mean - var' = scale' * -0.5 * sqrt(var + epsilon)^{-1.5} - = -0.5 * scale' * scale^3 - mean' = -offset' * scale - 2 * mean * var' - xsum' = mean' / n - x2sum' = var' / n - - So the derivatives propagated back to the original data are: - x'(i) = y'(i) * scale + xsum' + x(i) * x2sum' - - The above is quite complicated to compute, but we can use some invariances - to work out a simpler way to compute the derivatives. - - Firstly, note that x'(i) is of the form: - - x'(i) = y'(i) * scale + [affine function of x(i)]. - - [it's a 1-d affine function, i.e. 
offset and scale]. - This has the same functional form as: - - x'(i) = y'(i) * scale + [affine function of y(i)]. - - since y(i) is an affine function of x(i) with nonzero scale. - Because the output is invariant to shifts in the input, sum_i x'(i) - will be zero. This is sufficient to determine the bias - term in the affine function. [Note: the scale on y(i) doesn't - come into it because the y(i) sum to zero]. The offset - will just be (sum_i y'(i) * scale / n); this makes the sum of x'(i) zero. - So let's write it as - - x'(i) = (y'(i) - 1/n sum_i y'(i)) * scale + alpha y(i). - - and it will be convenient to define: - - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - - which is just y'(i) with mean subtraction, scaled according to - the scale used in the normalization. So write - - x'(i) = x_deriv_base(i) + alpha y(i). - - The question is, what is the scale alpha. We don't actually need to - do any differentiation to figure this out. First, assume there is - no "+ epsilon" in the variance; later we'll explain why this doesn't - matter. The key to working out alpha is that the output is invariant - to scaling of the input. Assume we scale around the input's mean, - since that makes the math simpler. We can express this by the - constraint that (\sum_i x'(i) * (x(i) - avg-x)) = 0. This is - equivalent to the constraint that (\sum_i x'(i) y (i)) = 0, since - y(i) is x(i) - avg-x times a nonzero scale. We'll use this contraint - to determine alpha, Using the above expressionfor x(i), we can write - this constraint as: - \sum_i ( y(i) x_deriv_base(i) + alpha y(i) y(i)) = 0. - Now, since we said we'd ignore the epsilon, the output has unit variance, - so we know that \sum_i y(i) y(i) = n. - So alpha = - \sum_i y(i) x_deriv_base(i) / n. We can actually re-imagine - the epsilon term (or variance-flooring) as having been implemented by - adding a couple extra rows to the matrix with suitable values, and zero - output-deriv for those rows. 
If you think about it carefully you'll see that - the formula above is valid even if there is an extra term - in the variance. Anyway the correctness of the derivative will get tested - throughly by the component unit-tests. - - So to recap, here is the backprop. - - BACKWARD PASS: - - We are given y'(i), scale, and y(i). - - We compute: - x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - alpha = - \sum_i y(i) x_deriv_base(i) / n - x'(i) = x_deriv_base(i) + alpha y(i) + + rscale' = (sum_i y(i) z'(i)) * target-rms + = (sum_i z(i) z'(i)) / rscale + + [ note: d(rscale)/d(var) = power * (var + epsilon)^{power - 1} + = power * rscale^{(power-1)/power} ] + + var' = rscale' * power * rscale^{(power-1)/power} + = power * (\sum_i z'(i) z(i)) * rscale^{(power-1)/power - 1} + = power * (\sum_i z'(i) z(i)) * rscale^{-1/power} + + [note: the following formula is of the form "direct term" + "indirect term"] + y'(i) = z'(i) * target-rms * rscale + 2/I y(i) var' + + Now, the above is inconvenient because it contains y(i) which is an intermediate + quantity. We reformulate in terms of z(i), using y(i) = z(i) / (target-rms * rscale), so: + + defining + var_deriv_mod = 2/I * var' / (target-rms * rscale) + = 2/I * power/target-rms * (\sum_i z'(i) z(i)) * rscale^{-(1+power)/power} + we have: + y'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod + + Now, + mean' = \sum_i y'(i) + = (target-rms * rscale * \sum_i z'(i)) + (var_deriv_mod \sum_i z(i)) + [... and the 2nd term above is zero when summed over i, because \sum_i z(i) is zero, ...] + = target-rms * rscale * \sum_i z(i) + and: + x'(i) = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I mean' + = z'(i) * target-rms * rscale + z(i) var_deriv_mod - 1/I * target-rms * rscale * \sum_i z'(i) + = target-rms * rscale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + + It will simplify the code if we define: + + scale = target-rms * rscale. 
This way, we can write as follows: + + BACKWARD PASS (recap): + + var_deriv_mod = 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + + x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod + */ @@ -446,7 +428,7 @@ void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, Memo *memo = new Memo; int32 num_frames = in.NumRows(), dim = block_dim_; memo->num_frames = num_frames; - memo->mean_uvar_scale.Resize(4, dim); + memo->mean_uvar_scale.Resize(5, dim); CuSubVector mean(memo->mean_uvar_scale, 0), uvar(memo->mean_uvar_scale, 1), scale(memo->mean_uvar_scale, 2); @@ -454,14 +436,14 @@ void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); scale.CopyFromVec(uvar); // by applying this scale at this point, we save a multiply later on. - BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); + BaseFloat var_scale = std::pow(target_rms_, -power_); scale.AddVecVec(-var_scale, mean, mean, var_scale); - // at this point, 'scale' contains just the variance [divided by target-rms^2]. + // at this point, 'scale' contains just the variance (times target-rms^{-power}) scale.ApplyFloor(0.0); scale.Add(var_scale * epsilon_); // Now 'scale' contains the variance floored to zero and then with epsilon - // added [both divided by target-rms^2]. - scale.ApplyPow(-0.5); + // added [both times target-rms^{-power}] + scale.ApplyPow(power_); // now 'scale' is the actual scale we'll use. // the next command will do no work if out == in, for in-place propagation. 
@@ -525,26 +507,47 @@ void BatchNormComponent::Backprop( KALDI_ASSERT(memo != NULL && "memo not passed into backprop"); int32 num_frames = memo->num_frames; KALDI_ASSERT(out_value.NumRows() == num_frames); - CuSubVector temp(memo->mean_uvar_scale, 3), - scale(memo->mean_uvar_scale, 2); + CuSubVector + scale(memo->mean_uvar_scale, 2), + temp(memo->mean_uvar_scale, 4), + var_deriv_mod(memo->mean_uvar_scale, 3), + scale_pow(memo->mean_uvar_scale, 4); + + // var_deriv_mod is going to contain: + // 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + // but for now we don't have the power of 'scale', we'll add that later. + BaseFloat coeff = 2.0 * power_ * std::pow(target_rms_, 1.0 / power_) / + num_frames; + var_deriv_mod.AddDiagMatMat(coeff, out_value, kTrans, + out_deriv, kNoTrans, 0.0); + + temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); - // the following does no work if in_deriv and out_deriv are the same matrix. + // the following statement does no work if in_deriv and out_deriv are the same matrix. in_deriv->CopyFromMat(out_deriv); in_deriv->AddVecToRows(1.0, temp); + // At this point, *in_deriv contains + // (z'(i) - 1/I * \sum_i z'(i)) in_deriv->MulColsVec(scale); - // at this point, 'in_deriv' contains: - // x_deriv_base(i) = (y'(i) - 1/n sum_i y'(i)) * scale - temp.AddDiagMatMat(-1.0 / (num_frames * target_rms_ * target_rms_), - out_value, kTrans, *in_deriv, kNoTrans, 0.0); - // now, 'temp' contains the quantity which we described - // in the math as: - // alpha = - \sum_i y(i) x_deriv_base(i) / n. - // The factor 1 / (target_rms_ * target_rms_) comes from following - // this additional scaling factor through the math. In the comment I said - // "we know that \sum_i y(i) y(i) = n". Taking target-rms into account - // this becomes "we know that \sum_i y(i) y(i) = n * target-rms^2". - in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, temp, 1.0); - // At this point, in_deriv contains x'(i) = x_deriv_base(i) + alpha y(i). 
+ // At this point, *in_deriv contains + // scale * (z'(i) - 1/I * \sum_i z'(i)) + + // The next few lines complete the calculation of 'var_deriv_mod'; + // we delayed it because we were using 'temp', and 'scale_pow' + // uses the same memory. + if (power_ == -0.5) { + // we can simplify scale^{-(1+power)/power} to just 'scale'. + var_deriv_mod.MulElements(scale); + } else { + scale_pow.CopyFromVec(scale); + scale_pow.ApplyPow(-1.0 * (1.0 + power_) / power_); + var_deriv_mod.MulElements(scale_pow); + } + in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, + var_deriv_mod, 1.0); + // At this point, *in_deriv contains what we described in the comment + // starting BATCHNORM_MATH as: + // x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod } else { KALDI_ASSERT(offset_.Dim() == block_dim_); // the next call does no work if they point to the same memory. @@ -598,6 +601,12 @@ void BatchNormComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &block_dim_); + if (PeekToken(is, binary) == 'P') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &power_); + } else { + power_ = -0.5; + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &epsilon_); ExpectToken(is, binary, ""); @@ -625,6 +634,10 @@ void BatchNormComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, block_dim_); + if (power_ != -0.5) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, power_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, epsilon_); WriteToken(os, binary, ""); diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 5299862ee65..84b5dbd817a 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -227,13 +227,11 @@ class BatchNormComponent: public Component { struct Memo { // number of frames 
(after any reshaping). int32 num_frames; - // 'sum_sumsq_scale' is of dimension 4 by block_dim_: + // 'sum_sumsq_scale' is of dimension 5 by block_dim_: // Row 0 = mean = the mean of the rows of the input // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). // Row 2 = scale = the scale of the renormalization, which is - // Row 3 is used as a temporary in Backprop. - // the inverse stddev of the input (modified by epsilon_, - // see the Propagate function. + // Rows 3 and 4 are used as a temporaries in Backprop. CuMatrix mean_uvar_scale; }; @@ -260,6 +258,12 @@ class BatchNormComponent: public Component { // always will in the new code in nnet-convolutional-component.h. int32 block_dim_; + + // This power determines the scale as a power of the variance... the default + // (-0.5) corresponds to regular BatchNorm, but you can set it to other + // values, like -0.25 or -0.4, for what we'll call "fractional BatchNorm" + BaseFloat power_; + // Used to avoid exact-zero variances, epsilon has the dimension of a // covariance. BaseFloat epsilon_; @@ -311,6 +315,9 @@ class BatchNormComponent: public Component { is treated like a separate row of the input matrix, which means that the stats from n'th element of each block are pooled into one class, for each n.a + power Power that determines the scale we apply, as a function of + the variance. The default, -0.5, corresponds to regular + BatchNorm. epsilon Small term added to the variance that is used to prevent division by zero target-rms This defaults to 1.0, but if set, for instance, to 2.0, diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 83b902a9b90..781b4e558e9 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1681,6 +1681,7 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " block-dim=" << block_dim << " target-rms=" << RandInt(1, 2) << " test-mode=" << (test_mode ? 
"true" : "false") + << " power=" << (-0.1 * RandInt(3, 5)) << " epsilon=" << (RandInt(0, 1) == 0 ? "0.1" : "1.0"); break; } From 6870287eaa9d43ebec5e360c124808e00b130804 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 27 Dec 2017 21:29:26 -0800 Subject: [PATCH 037/184] [src] Fix bug in normalize-component RE target-rms --- src/nnet3/nnet-normalize-component.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index 3f105cd8e2b..ad5fc2466d4 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -320,7 +320,7 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { /* - BATCH_NORM_MATH + BATCHNORM_MATH This comment describes the equations involved in batch normalization, and derives the forward and back-propagation. @@ -436,7 +436,7 @@ void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); scale.CopyFromVec(uvar); // by applying this scale at this point, we save a multiply later on. - BaseFloat var_scale = std::pow(target_rms_, -power_); + BaseFloat var_scale = std::pow(target_rms_, 1.0 / power_); scale.AddVecVec(-var_scale, mean, mean, var_scale); // at this point, 'scale' contains just the variance (times target-rms^{-power}) scale.ApplyFloor(0.0); @@ -523,7 +523,8 @@ void BatchNormComponent::Backprop( temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); - // the following statement does no work if in_deriv and out_deriv are the same matrix. + // the following statement does no work if in_deriv and out_deriv are the + // same matrix. 
in_deriv->CopyFromMat(out_deriv); in_deriv->AddVecToRows(1.0, temp); // At this point, *in_deriv contains From af1817591e847f0fcca0bdaa16ae3c271fc4ce1d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 29 Dec 2017 15:41:49 -0500 Subject: [PATCH 038/184] [scripts] Add batchnorm-power and diagonal-power options to basic_layers.py --- egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index e62a090c25e..09681625034 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -674,6 +674,7 @@ def set_default_configs(self): 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, + 'batchnorm-power': -0.5, 'ng-affine-options': '', 'ng-linear-options': '', # only affects bottleneck layers. 'dropout-proportion': 0.5, # dropout-proportion only @@ -685,6 +686,8 @@ def set_default_configs(self): 'bias-stddev': '', 'l2-regularize': '', 'learning-rate-factor': '', + 'diagonal-power-in': '', + 'diagonal-power-out': '', 'max-change': 0.75 } def check_configs(self): @@ -753,10 +756,12 @@ def _add_components(self, input_desc, input_dim, nonlinearities): output_dim = self.output_dim() self_repair_scale = self.config['self-repair-scale'] target_rms = self.config['target-rms'] + batchnorm_power = self.config['batchnorm-power'] affine_options = self.config['ng-affine-options'] for opt_name in [ 'max-change', 'learning-rate-factor', - 'bias-stddev', 'l2-regularize' ]: + 'bias-stddev', 'l2-regularize', + 'diagonal-power-in', 'diagonal-power-out' ]: value = self.config[opt_name] if value != '': affine_options += ' {0}={1}'.format(opt_name, value) @@ -844,9 +849,9 @@ def _add_components(self, input_desc, input_dim, nonlinearities): elif nonlinearity == 'batchnorm': line = ('component name={0}.{1}' ' type=BatchNormComponent 
dim={2}' - ' target-rms={3}' + ' target-rms={3} power={4}' ''.format(self.name, nonlinearity, output_dim, - target_rms)) + target_rms, batchnorm_power)) elif nonlinearity == 'memnorm': line = ('component name={0}.{1}' From 8368ab4e7320862c56205e6b1cba26cd8d2240e7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 29 Dec 2017 18:44:17 -0800 Subject: [PATCH 039/184] [src,scripts] Remove MemoryNormComponent --- .../steps/libs/nnet3/xconfig/basic_layers.py | 7 - egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 7 +- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 - src/nnet3/nnet-component-itf.cc | 2 - src/nnet3/nnet-component-test.cc | 6 +- src/nnet3/nnet-compute.cc | 6 +- src/nnet3/nnet-normalize-component.cc | 527 ------------------ src/nnet3/nnet-normalize-component.h | 245 +------- src/nnet3/nnet-simple-component.cc | 6 - src/nnet3/nnet-test-utils.cc | 4 +- src/nnet3/nnet-utils.cc | 9 +- src/nnet3/nnet-utils.h | 10 +- 12 files changed, 22 insertions(+), 808 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 09681625034..6393709d82e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -853,13 +853,6 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim, target_rms, batchnorm_power)) - elif nonlinearity == 'memnorm': - line = ('component name={0}.{1}' - ' type=MemoryNormComponent dim={2}' - ' target-rms={3} ' - ''.format(self.name, nonlinearity, output_dim, - target_rms)) - elif nonlinearity == 'so': line = ('component name={0}.{1}' ' type=ScaleAndOffsetComponent dim={2} max-change=0.5 ' diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 5827ea4d179..d226a891113 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -914,9 +914,6 @@ def 
_generate_lstm_config(self): name, input_dim + cell_dim, bottleneck_dim, affine_str)) - configs.append("component name={0}.c_trunc_memnorm type=MemoryNormComponent dim={1} ".format( - name, cell_dim)) - configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, affine_str, l2_regularize_option)) @@ -940,7 +937,7 @@ def _generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc_memnorm, {2})))".format( + "IfDefined(Offset({0}.c_trunc, {2})))".format( name, input_descriptor, delay)) configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) @@ -955,8 +952,6 @@ def _generate_lstm_config(self): configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " "dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) - configs.append("component-node name={0}.c_trunc_memnorm component={0}.c_trunc_memnorm " - "input={0}.c_trunc".format(name)) configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index db9550818cd..71205961681 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -25,7 +25,6 @@ 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, 'relu-dropout-layer': xlayers.XconfigBasicLayer, 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, - 'relu-memnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, 'sigmoid-layer' : 
xlayers.XconfigBasicLayer, diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index f83ad26f375..8a52b7b788c 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -164,8 +164,6 @@ Component* Component::NewComponentOfType(const std::string &component_type) { ans = new LstmNonlinearityComponent(); } else if (component_type == "BatchNormComponent") { ans = new BatchNormComponent(); - } else if (component_type == "MemoryNormComponent") { - ans = new MemoryNormComponent(); } else if (component_type == "TimeHeightConvolutionComponent") { ans = new TimeHeightConvolutionComponent(); } else if (component_type == "RestrictedAttentionComponent") { diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index d7595378c1f..882ef112919 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -274,7 +274,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, int32 input_dim = c.InputDim(), output_dim = c.OutputDim(), - num_rows = RandInt(1, 100), + num_rows = RandInt(1, 20), rand_seed = Rand(); int32 properties = c.Properties(); CuMatrix input_data(num_rows, input_dim, kSetZero, input_stride_type), @@ -317,7 +317,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, } KALDI_LOG << "Predicted objf-change = " << predicted_objf_change; KALDI_LOG << "Measured objf-change = " << measured_objf_change; - BaseFloat threshold = 0.1; + BaseFloat threshold = 0.05; bool ans = ApproxEqual(predicted_objf_change, measured_objf_change, threshold); if (!ans) KALDI_WARN << "Data-derivative test failed, component-type=" @@ -442,7 +442,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, void UnitTestNnetComponent() { - for (int32 n = 0; n < 200; n++) { + for (int32 n = 0; n < 2000; n++) { Component *c = GenerateRandomSimpleComponent(); KALDI_LOG << c->Info(); TestNnetComponentIo(c); diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 
12a4ec65ae9..23286211301 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -268,9 +268,9 @@ void NnetComputer::ExecuteCommand() { nnet_to_update = (computation_.need_model_derivative ? nnet_to_update_ : NULL); } else { - // Some non-updatable components, such as CompositeComponent and - // MemoryNormComponent, store stats in the backprop. For other - // types of component, this arg won't matter. + // Some non-updatable components, such as CompositeComponent, store + // stats in the backprop. For other types of non-updatable + // component, this arg won't matter. nnet_to_update = nnet_to_store_stats_; } if (nnet_to_update) diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index ad5fc2466d4..e6be8210bb0 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -697,532 +697,5 @@ void BatchNormComponent::ZeroStats() { } - - -/** - MEMORY_NORM_MATH - - This comment describes the equations involved in 'memory-norm'. - memory-norm is like batch normalization, except instead of computing - everything on the current minibatch, we deal with decaying averages - over time, interpreted as expectations. We'll firm up the math later. - The idea is to obtain a form of batch-norm that is compatible with - use in recurrent neural nets. - - Everything is dimension by dimension here, so let's imagine the input and - output are one-dimensional. Any index 'i' is going to be like a frame index - or an index referring to a sample. We'll be writing down some expectations, - and we're rather cavalier with notation; these basically mean - exponentially-decaying weighted averages over time. - - The input will be x(i), and the output y(i). - - Each frame will have a weight, w(i) >= 0. (these will be part of the - decaying averages)... 
- - Let's define - count = \sum_i w(i) - sum = \sum_i w(i) x(i) - sumsq = \sum_i w(i) x(i)^2 - - We can compute: - mean = sum / count - var = epsilon + (sumsq / count) - (mean * mean) - scale = target_rms * var^{-0.5} - - y(i) = (x(i) - mean) * scale. - - We are given the derivatives of the objective function w.r.t. the - outputs; we'll write these as y'(i) [CAUTION: this is nonstandard - notation. An apostrophe on something means the derivative of the - objective function w.r.t. that thing]. - - Over this data, with these weights, we can compute the derivative - of the objective w.r.t. the mean and the scale: - - mean' = -scale * \sum_i w(i) y'(i) - scale' = \sum_i w(i) y'(i) (x(i) - mean) - = 1/scale \sum_i w(i) y'(i) y(i) - var' = -0.5 target_rms var^{-1.5} scale' - = -0.5 target_rms var^{-1.5} (1/scale) \sum_i w(i) y'(i) y(i) - .. and using 1/scale = var^{0.5}/target_rms, - = -0.5 var^{-1} \sum_i w(i) y'(i) y(i) (*) - - - It will be convenient to write down 'per-frame' versions of all of these - quantities, which are divided by the total count: - mean_norm' = mean' / count - scale_norm' = scale' / count - var_norm' = var' / count - (we keep the apostrophe on these quantities as it clarifies that they - are derivatives of the objective function w.r.t something). - - Now, 'var' can be written as: - var = epsilon + (1/count) \sum_i w(i) (x(i) - mean)^2 - and the following formula is more convenient to propagate the derivative - back to an x(i). - Note: the following has 3 terms, which we can think of as - "direct term" (given fixed mean and scale), - "term via mean" (term that comes via derivative of the mean) - "term via scale" (term that comes via derivative of the scale) - - - x'(i) = y'(i)*scale + mean_norm' + 2 var_norm' (x(i) - mean) - = y'(i)*scale + mean_norm' + 2 var_norm' y(i) / scale - ... 
and substituting in the equation (*) above for var', using var_norm' = var'/scale, - and rearranging slightly: - = y'(i)*scale + mean_norm' - y(i) * var^{-1}/scale * 1/count * \sum_i w(i) y'(i) y(i) - .. and using scale=target-rms * var^{-0.5}, so var^{-1}/scale = var^{-0.5}/target-rms = scale/target-rms^2: - = y'(i)*scale + mean_norm' - y(i) * scale/(count*target-rms^2) * \sum_i w(i) y'(i) y(i) - .. and considering that the factor of 'scale' appears (directly or indirectly) in all 3 - of the terms in the above expression, we can reorganize this as: - = scale * (y'(i) - 1/count*\sum_i w(i)*y(i) - 1/(count*target-rms^2) * \sum_i w(i) y'(i) y(i)) -*/ - - -void MemoryNormComponent::SetTestMode(bool test_mode) { - if (test_mode && stats_count_ <= 0) { - KALDI_WARN << "Refusing to set test-mode in MemoryNormComponent since no " - "stats are present."; - return; - } - test_mode_ = test_mode; -} - -void MemoryNormComponent::Check() const { - KALDI_ASSERT(dim_ > 0 && block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0.0 && target_rms_ > 0.0 && - stats_count_ >= 0.0 && backward_count_ >= 0.0); - -} - -MemoryNormComponent::MemoryNormComponent(const MemoryNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), - target_rms_(other.target_rms_), - include_indirect_derivative_(other.include_indirect_derivative_), - test_mode_(other.test_mode_), - stats_count_(other.stats_count_), backward_count_(other.backward_count_), - data_(other.data_) { - Check(); -} - - -std::string MemoryNormComponent::Info() const { - std::ostringstream stream; - stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ - << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ - << ", include-indirect-derivative=" - << (include_indirect_derivative_ ? "true" : "false") - << ", stats-count=" << stats_count_ << ", backward-count=" - << backward_count_ - << ", test-mode=" << (test_mode_ ? 
"true" : "false"); - if (stats_count_ > 0.0) { - CuSubVector x_mean(data_, 0), - y_deriv(data_, 2), y_deriv_y(data_, 3), - scale(data_, 4); - if (stats_count_ > 0.0) - stream << ", x-mean=" << SummarizeVector(x_mean) - << ", scale=" << SummarizeVector(scale); - if (backward_count_ > 0.0) - stream << ", y-deriv=" << SummarizeVector(y_deriv) - << ", y-deriv-y=" << SummarizeVector(y_deriv_y); - } - return stream.str(); -} - -void MemoryNormComponent::InitFromConfig(ConfigLine *cfl) { - dim_ = -1; - block_dim_ = -1; - epsilon_ = 1.0e-03; - target_rms_ = 1.0; - include_indirect_derivative_ = true; - test_mode_ = false; - - bool ok = cfl->GetValue("dim", &dim_); - cfl->GetValue("block-dim", &block_dim_); - cfl->GetValue("epsilon", &epsilon_); - cfl->GetValue("target-rms", &target_rms_); - cfl->GetValue("include-indirect-derivative", &include_indirect_derivative_); - cfl->GetValue("test-mode", &test_mode_); - if (!ok || dim_ <= 0) { - KALDI_ERR << "MemoryNormComponent must have 'dim' specified, and > 0"; - } - if (block_dim_ == -1) - block_dim_ = dim_; - if (!(block_dim_ > 0 && dim_ % block_dim_ == 0 && - epsilon_ > 0 && target_rms_ > 0)) - KALDI_ERR << "Invalid configuration in MemoryNormComponent."; - if (cfl->HasUnusedValues()) - KALDI_ERR << "Could not process these elements in initializer: " - << cfl->UnusedValues(); - stats_count_ = 0.0; - backward_count_ = 0.0; - data_.Resize(5, block_dim_); -} - - - -void* MemoryNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const { - KALDI_ASSERT(SameDim(in, *out) && - (in.NumCols() == dim_ || in.NumCols() == block_dim_)); - if (in.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. 
- KALDI_ASSERT(in.Stride() == in.NumCols() && out->Stride() == out->NumCols()); - int32 ratio = dim_ / block_dim_, orig_rows = in.NumRows(), - orig_cols = in.NumCols(), new_rows = orig_rows * ratio, - new_cols = orig_cols / ratio; - CuSubMatrix in_reshaped(in.Data(), new_rows, new_cols, new_cols), - out_reshaped(out->Data(), new_rows, new_cols, new_cols); - return Propagate(indexes, in_reshaped, &out_reshaped); - } - - if (out->Data() != in.Data()) - out->CopyFromMat(in); - - if (test_mode_ && stats_count_ <= 0.0) - KALDI_ERR << "Test mode set but no stats available."; - - // From this point, we can assume that the num-cols of 'in' and 'out' - // equals block_dim_. - Memo *memo = NULL; - if (!test_mode_) { - memo = GetMemo(in); - } - - if (test_mode_) { - CuSubVector x_mean(data_, 0), scale(data_, 4); - out->AddVecToRows(-1.0, x_mean); - out->MulColsVec(scale); - } else { - CuSubVector x_mean(memo->data, 0), - scale(memo->data, 4); - out->AddVecToRows(-1.0, x_mean); - out->MulColsVec(scale); - } - return memo; -} - - -MemoryNormComponent::Memo* MemoryNormComponent::GetMemo( - const CuMatrixBase &in) const { - KALDI_ASSERT(in.NumCols() == block_dim_ && !test_mode_ && - stats_count_ >= 0.0); - Memo *memo = new Memo; - BaseFloat old_stats_count = stats_count_, - num_frames = in.NumRows(), - new_stats_count = num_frames + old_stats_count, - old_weight = old_stats_count / new_stats_count; - - // The information in 'memo' will be copied to *this when - // StoreStats() is called (we can't update it in the Propagate() - // function for 'const' reasons). - memo->stats_count = new_stats_count; - memo->backward_count = backward_count_; - memo->data = data_; - - CuSubVector x_mean(memo->data, 0), - x_uvar(memo->data, 1), scale(memo->data, 4); - // Each row of 'in' gets a weight of 1.0 / new_stats_count in the stats. 
- x_mean.AddRowSumMat(1.0 / new_stats_count, in, old_weight); - x_uvar.AddDiagMat2(1.0 / new_stats_count, in, kTrans, old_weight); - - scale.CopyFromVec(x_uvar); - // we save a CUDA operation by applying the scale 'target_rms_scale' before doing - // ApplyPow(-0.5), and this requires taking it to the power -2. - BaseFloat target_rms_scale = 1.0 / (target_rms_ * target_rms_); - scale.AddVecVec(-target_rms_scale, x_mean, x_mean, target_rms_scale); - // at this point, 'scale' is the variance. - scale.ApplyFloor(0.0); - scale.Add(epsilon_ * target_rms_scale); - scale.ApplyPow(-0.5); - // OK, now 'scale' is the scale. - return memo; -} - -void MemoryNormComponent::Backprop( - const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in_value, // unused. - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo_in, - Component *to_update_in, - CuMatrixBase *in_deriv) const { - - KALDI_ASSERT(SameDim(out_deriv, *in_deriv) && - (out_deriv.NumCols() == dim_ || - out_deriv.NumCols() == block_dim_)); - if (out_deriv.NumCols() != block_dim_) { - // if block_dim_ != dim_, we recurse; this helps keep the main code - // simple. - KALDI_ASSERT(out_deriv.Stride() == out_deriv.NumCols() && - in_deriv->Stride() == in_deriv->NumCols()); - if (out_value.NumRows() != 0) { - KALDI_ASSERT(out_value.Stride() == out_value.NumCols()); - } - int32 ratio = dim_ / block_dim_, - orig_rows = out_value.NumRows(), - orig_cols = out_value.NumCols(), - new_rows = orig_rows * ratio, new_cols = orig_cols / ratio; - CuSubMatrix - out_deriv_reshaped(out_deriv.Data(), new_rows, new_cols, new_cols), - in_deriv_reshaped(in_deriv->Data(), new_rows, new_cols, new_cols); - - // we'll never use in_value, so pass it in unchanged. 
- if (out_value.NumRows() != 0) { - CuSubMatrix out_value_reshaped(out_value.Data(), new_rows, - new_cols, new_cols); - Backprop(debug_info, indexes, in_value, - out_value_reshaped, out_deriv_reshaped, - memo_in, to_update_in, &in_deriv_reshaped); - } else { - Backprop(debug_info, indexes, in_value, - out_value, out_deriv_reshaped, - memo_in, to_update_in, &in_deriv_reshaped); - } - return; - } - - // assume in_deriv is non-NULL, because a non-updatable Component will not - // have the backprop called if the in_deriv is non-NULL. - - if (test_mode_) { - // In test mode we treat it as a fixed scale and offset. - KALDI_ASSERT(memo_in == NULL && stats_count_ != 0.0); - // the following is a no-op if in_deriv and out_deriv are the same matrix. - in_deriv->CopyFromMat(out_deriv); - CuSubVector scale(data_, 4); - in_deriv->MulColsVec(scale); - return; - } - - // OK, we're not in test mode. - // Before computing 'in_deriv', we may need to store some stats. - if (include_indirect_derivative_ && to_update_in != NULL) { - // Store some stats which are necessary to compute the 'indirect derivative' - // term (this is analogous to the part of the derivative in regular backprop - // that comes from the objf derivative w.r.t. the mean and variance stats). - // - // Note: instead of simply adding to the stats 'y_deriv' and 'y_deriv_y', - // the following equations do a kind of weighted combination, because - // these stats are stored normalized by the total count (backward_count_). 
- MemoryNormComponent *to_update = - dynamic_cast(to_update_in); - BaseFloat backward_count = to_update->backward_count_, - num_frames = in_deriv->NumRows(), - new_backward_count = backward_count + num_frames, - old_weight = backward_count / new_backward_count; - CuSubVector y_deriv(to_update->data_, 2), - y_deriv_y(to_update->data_, 3); - // The factor 1.0 / new_backward_count that appears below can be perhaps more - // clearly written as follows: first define - // new_weight = num_frames / new_backward_count - // and then write new_weight / num_frames, which simplifies to - // 1.0 / new_backward_count. The factor of 1.0 / num_frames is necessary to - // convert from data sums to a per-frame average. - y_deriv.AddRowSumMat(1.0 / new_backward_count, out_deriv, old_weight); - y_deriv_y.AddDiagMatMat(1.0 / new_backward_count, out_deriv, kTrans, - out_value, kNoTrans, old_weight); - to_update->backward_count_ = new_backward_count; - - // Now 'to_update' will typically be the same as 'this', so we need - // to compute the derived parameters because it affects some code that's - // below. - to_update->ComputeDerived(); - } - - // the following does no work if in_deriv and out_deriv are the same matrix. - in_deriv->CopyFromMat(out_deriv); - - if (this->backward_count_ != 0.0) { - CuSubVector y_deriv(data_, 2), - y_deriv_y(data_, 3); - in_deriv->AddVecToRows(-1.0, y_deriv); - in_deriv->AddMatDiagVec(-1.0 / (target_rms_ * target_rms_), - out_value, kNoTrans, y_deriv_y); - } - CuSubVector scale(data_, 4); - in_deriv->MulColsVec(scale); -} - - -void MemoryNormComponent::ComputeDerived() { - KALDI_ASSERT(stats_count_ >= 0.0 && data_.NumRows() == 5); - if (stats_count_ == 0.0) { - // zero 'scale'. 
- data_.Row(4).SetZero(); - return; - } - CuSubVector x_mean(data_, 0), x_uvar(data_, 1), - scale(data_, 4); - scale.CopyFromVec(x_uvar); - // we save a CUDA operation by applying the scale 'target_rms_scale' before doing - // ApplyPow(-0.5), and this requires taking it to the power -2. - BaseFloat target_rms_scale = 1.0 / (target_rms_ * target_rms_); - scale.AddVecVec(-target_rms_scale, x_mean, x_mean, target_rms_scale); - // at this point, 'scale' is the variance (divided by target_rms^2). - scale.ApplyFloor(0.0); - scale.Add(epsilon_ * target_rms_scale); - scale.ApplyPow(-0.5); -} - -void MemoryNormComponent::StoreStats( - const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - void *memo_in) { - // in test mode this component does not store stats; it doesn't provide the - // kStoresStats flag so this function won't be called. - KALDI_ASSERT(!test_mode_ && memo_in != NULL && stats_count_ >= 0.0); - - // We don't actually need 'in_value' and 'out_value', as the - // required statistics are already stored in 'memo_in'. - Memo *memo = static_cast(memo_in); - - // check that the memo's stats count is more than our stats_count_, - // which it should be because the memo should have added extra stats, - // and StoreStats() should be called directly after the Propagate() - // function. - // This could possibly fail with memo_in->stats_count == stats_count_ - // due to roundoff, if you trained with batchnorm-stats-scale set at 1, - // but that would be a poor choice of parameters anyway as - // roundoff would be a big problem. - KALDI_ASSERT(memo->stats_count > stats_count_); - - stats_count_ = memo->stats_count; - // Copying the entire data matrix should be safe because - // StoreStats() is always called directly after the corresponding - // Propagate(), and on the same object; and there should be - // no possibility that other things in this->data_ changed in - // the interim. 
- data_.CopyFromMat(memo->data); -} - -void MemoryNormComponent::Read(std::istream &is, bool binary) { - ExpectOneOrTwoTokens(is, binary, "", ""); - ReadBasicType(is, binary, &dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &block_dim_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &epsilon_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &target_rms_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &include_indirect_derivative_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &test_mode_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &stats_count_); - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &backward_count_); - ExpectToken(is, binary, ""); - data_.Read(is, binary); - ExpectToken(is, binary, ""); - Check(); -} - -void MemoryNormComponent::Write(std::ostream &os, bool binary) const { - Check(); - WriteToken(os, binary, ""); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, block_dim_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, epsilon_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, target_rms_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, include_indirect_derivative_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, test_mode_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, stats_count_); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, backward_count_); - WriteToken(os, binary, ""); - data_.Write(os, binary); - WriteToken(os, binary, ""); -} - -void MemoryNormComponent::Scale(BaseFloat scale) { - if (scale <= 0) { - if (scale < 0.0) - KALDI_WARN << "Setting stats to zero in MemoryNormComponent: requested scale = " - << scale; - // If scale is negative we zero the stats. This may not always be the right - // thing to do, so we warn. 
- data_.SetZero(); - stats_count_ = 0.0; - backward_count_ = 0.0; - } else { - stats_count_ *= scale; - backward_count_ *= scale; - // 'data_' doesnt need to be changed, as all the quantities it contains are - // normalized by the count. - } -} - - -void MemoryNormComponent::Add(BaseFloat alpha, const Component &other_in) { - const MemoryNormComponent *other = - dynamic_cast(&other_in); - - static bool warned = false; - if (alpha < 0.0) { - if (!warned) { - warned = true; - KALDI_WARN << "Adding MemoryNormComponent with negative scale: will do nothing " - << "(will not warn again)."; - } - return; - } - - if (alpha * other->stats_count_ == 0.0 && - alpha * other->backward_count_ == 0.0) - return; - - BaseFloat - new_stats_count = stats_count_ + alpha * other->stats_count_, - new_backward_count = backward_count_ + alpha * other->backward_count_; - - if (new_stats_count > 0.0) { - // This block sets rows 0 and 1 of data_, which we call 'x_mean' and - // 'x_uvar, to the appropriate weighted combination of 'this' and 'other'. - BaseFloat this_scale = stats_count_ / new_stats_count, - other_scale = alpha * other->stats_count_ / new_stats_count; - data_.RowRange(0, 2).Scale(this_scale); - data_.RowRange(0, 2).AddMat(other_scale, other->data_.RowRange(0, 2)); - } - if (new_backward_count > 0.0) { - // This block sets rows 2 and 3 of data_, which we call 'y_deriv' and - // 'y_deriv_y', to the appropriate weighted combination of 'this' and - // 'other'. - BaseFloat this_scale = backward_count_ / new_backward_count, - other_scale = alpha * other->backward_count_ / new_backward_count; - data_.RowRange(2, 2).Scale(this_scale); - data_.RowRange(2, 2).AddMat(other_scale, other->data_.RowRange(2, 2)); - } - stats_count_ = new_stats_count; - backward_count_ = new_backward_count; - ComputeDerived(); -} - -void MemoryNormComponent::ZeroStats() { - // We only zero the stats if we're not in test mode. 
In test mode, this would - // be dangerous as the stats aren't really considered to be stats, they become - // a fixed part of the model. - if (!test_mode_) { - stats_count_ = 0.0; - backward_count_ = 0.0; - data_.SetZero(); - } -} - - - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index 84b5dbd817a..b10c3e4a60c 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -37,17 +37,19 @@ namespace nnet3 { /// @file nnet-normalize-component.h /// /// This file contains declarations of components that in one way or -/// another normalize their input: NormalizeComponent, BatchNormComponent, -/// and MemoryNormComponent. +/// another normalize their input: NormalizeComponent and BatchNormComponent. /* - Implements the function: + NormalizeComponent implements the function: y = x * (sqrt(dim(x)) * target-rms) / |x| - where |x| is the 2-norm of the vector x. I.e. its output is its input - scaled such that the root-mean-square values of its elements equals - target-rms. (As a special case, if the input is zero, it outputs zero). + where |x| is the 2-norm of the vector x. I.e. its output is its input + scaled such that the root-mean-square values of its elements equals + target-rms. (As a special case, if the input is zero, it outputs zero). + This is like Hinton's layer-norm, except not normalizing the mean, only + the variance. + Note: if you specify add-log-stddev=true, it adds an extra element to y which equals log(|x| / sqrt(dim(x))). @@ -299,237 +301,6 @@ class BatchNormComponent: public Component { }; -/* - MemoryNormComponent is like batch normalization, except the stats - are accumulated as a weighted sum over past minibatches (if this is - not the first minibatch), instead of over the current minibatch. - Caution: we don't test this component in the standard way because it - would fail the derivative tests. 
- - You can use it in the same way you would normally use BatchNormComponent. - - Accepted configuration values: - dim Dimension of the input and output - block-dim Defaults to 'dim', but may be set to a nonzero divisor - of 'dim'. In this case, each block of dimension 'block-dim' - is treated like a separate row of the input matrix, which - means that the stats from n'th element of each - block are pooled into one class, for each n.a - power Power that determines the scale we apply, as a function of - the variance. The default, -0.5, corresponds to regular - BatchNorm. - epsilon Small term added to the variance that is used to prevent - division by zero - target-rms This defaults to 1.0, but if set, for instance, to 2.0, - it will normalize the standard deviation of the output to - 2.0. 'target-stddev' might be a more suitable name, but this - was chosen for consistency with NormalizeComponent. - include-indirect-derivative This defaults to true, which means we - include the (smaller) derivative term that comes via the - mean and variance estimation. You might want to set this to - false for testing purposes. - */ -class MemoryNormComponent: public Component { - public: - - MemoryNormComponent() { } - - // constructor using another component - MemoryNormComponent(const MemoryNormComponent &other); - - virtual int32 InputDim() const { return dim_; } - virtual int32 OutputDim() const { return dim_; } - - virtual std::string Info() const; - virtual void InitFromConfig(ConfigLine *cfl); - virtual std::string Type() const { return "MemoryNormComponent"; } - virtual int32 Properties() const { - // If the block-dim is less than the dim, we need the input and output - // matrices to be contiguous (stride==num-cols), as we'll be reshaping - // internally. This is not much of a cost, because this will be used - // in convnets where we have to do this anyway. 
- bool iid = include_indirect_derivative_; - return kSimpleComponent|kPropagateInPlace|kBackpropInPlace| - (test_mode_ ? 0 : kUsesMemo|kStoresStats|(iid?kBackpropNeedsOutput:0))| - (block_dim_ < dim_ ? kInputContiguous|kOutputContiguous : 0); - - } - - // Call this function to set 'test mode' to true or false. In test - // mode the stats are frozen and will not be updated. - void SetTestMode(bool test_mode); - - - virtual void* Propagate(const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &in, - CuMatrixBase *out) const; - - /// The backprop function. In addition to propagating the input back to - /// 'in_deriv', if supplied, this function also updates, in 'to_update', - /// backward_count_ and the rows named 'y_deriv' and 'y_deriv_y' of - /// data_, and also the derived quantities 'x_deriv' and 'scale_deriv' - /// of data_. - /// (note: in training, 'to_update' will point to delta_nnet_, and later these - /// stats get added to nnet_ via Add()) - virtual void Backprop(const std::string &debug_info, - const ComponentPrecomputedIndexes *indexes, - const CuMatrixBase &, // in_value - const CuMatrixBase &out_value, - const CuMatrixBase &out_deriv, - void *memo, - Component *to_update, - CuMatrixBase *in_deriv) const; - - virtual void Read(std::istream &is, bool binary); // This Read function - // requires that the Component has the correct type. - - /// Write component to stream - virtual void Write(std::ostream &os, bool binary) const; - virtual Component* Copy() const { return new MemoryNormComponent(*this); } - - // Note: if you scale by a negative number it will set stats to zero - // rather than allow a negative stats count. - virtual void Scale(BaseFloat scale); - // Note: if you try to add with negative coefficient (as in backstitch), it - // will do nothing. 
- virtual void Add(BaseFloat alpha, const Component &other); - virtual void ZeroStats(); - - virtual void DeleteMemo(void *memo) const { delete static_cast(memo); } - - /// This function updates stats_count_, the rows named 'x_mean', 'x_uvar' - /// of data_, and also the derived quantities stored in the rows named - /// 'scale', 'x_deriv' and 'scale_deriv' of data_. - /// (note: in training, this is called on the delta_nnet_, and later - /// the stats get added to nnet_ via Add()) - virtual void StoreStats(const CuMatrixBase &, // in_value - const CuMatrixBase &, // out_value - void *memo); - - private: - - struct Memo { - // 'stats_count' is the same as stats_count_ in the MemoryNormComponent - // from whose Propagate() function this memo was generated, plus - // the number of frames we're propagating (this is after any reshaping - // if block_dim_ != dim_). - BaseFloat stats_count; - - // 'stats_count' is the same as stats_count_ in the MemoryNormComponent - // from whose Propagate() function this memo was generated. It's mainly - // included because the backprop code wants to see if this was nonzero. - BaseFloat backward_count; - - // The structure of 'data' is the same as the data_ member of - // MemoryNormComponent; it's a matrix of dimension 5 by block_dim_. - // It will differ from the data_ member of the component we generated this - // from by the addition of some extra data in the 'x_sum' and 'x_sumsq' - // stats, and a corresponding modification of the 'scale', 'x_deriv' - // and 'scale_deriv' quantities. - // - // (note: the reason we update the stats before propagation rather - // than after, is for stability: otherwise, with relu units, if we only - // update the stats after the propagation we get a particular pathology: if - // a unit was previously always zero it will get a big scale; and if then we - // start getting some nonzero output, the scale on it will be too large.) 
- CuMatrix data; - }; - - - /// This piece of code, which has been broken out from Propagate(), computes - /// the memo. Expects in.NumCols() == block_dim_. It should only be called - /// if test_mode_ is false. - Memo *GetMemo(const CuMatrixBase &in) const; - - /// This function computes certain members of data_ that are derived: - /// specifically, rows 4, 5 and 6, which are called 'scale', 'x_deriv' and - /// 'scale_deriv'. - void ComputeDerived(); - - void Check() const; - - // this function is used in a couple of places; it turns the raw stats into - // the offset/scale term of a normalizing transform. - static void ComputeOffsetAndScale(BaseFloat count, - BaseFloat epsilon, - const Vector &stats_sum, - const Vector &stats_sumsq, - Vector *offset, - Vector *scale); - - // Dimension of the input and output. - int32 dim_; - - // block_dim_ would normally be the same as dim_, but if it's less (and it - // must be > 0 and must divide dim_), then each separate block of the input of - // dimension 'block_dim_' is treated like a separate frame for the purposes of - // normalization. This can be used to implement spatial batch normalization - // for convolutional setups-- assuming the filter-dim has stride 1, which it - // always will in the new code in nnet-convolutional-component.h. - int32 block_dim_; - - // Used to avoid exact-zero variances, epsilon has the dimension of a - // covariance. - BaseFloat epsilon_; - - // This controls the dynamic range of the output. At 1.0 which is the - // default, the output has unit standard deviation, but you can set it to - // other values. The same config exists in NormalizeComponent. - BaseFloat target_rms_; - - // If true, we include the smaller indirect part of the derivative, that comes - // via the stats estimation. This is included mostly for testing purposes; we - // expect this will normally be true. - bool include_indirect_derivative_; - - // If test_mode_ is set, no stats will be accumulated. 
It's an error if - // test_mode_ is set and the data count is zero, and you try to propagate. - bool test_mode_; - - // The total count of stats stored by StoreStats(), and which are represented - // in x_mean = data_.Row(0) and x_uvar = data_.Row(1). We never allow this to - // become less than zero, even if people do unexpected things with Add() and - // Scale(). - BaseFloat stats_count_; - - // backward_count_ is the total count of stats accumulated during backprop, - // and represents the count correspondsing to the stats in 'y_deriv' and - // 'y_deriv_y'. It is expected to be either zero or the same as stats_count_, - // in most circumstances, depending whether you were doing backprop or just - // inference-- but we don't enforce this because there may be situations where - // this is not the case. - // - // We never allow this to become less than zero, even if people do unexpected - // things with Add() and Scale(). - BaseFloat backward_count_; - - // We store data_ as a single matrix because it enables certain operations - // to be done using fewer kernels, but it contains various different quantities, - // which we'll describe below as if they were separate variables. - // data_ is of dimension 5 by block_dim_. - CuMatrix data_; - // data_.Row(0) is 'x_mean', which is the decaying moving-average of - // input data x; or zero if stats_count_ is zero. - // data_.Row(1) is 'x_uvar', which is the decaying moving-average of - // input data x^2 or zero if stats_count_ is zero. - // data_.Row(2) is 'y_deriv', which is the decaying moving-average - // derivative of the objective w.r.t. the output y; or - // zero if backward_count_ is zero. - // data_.Row(3) is 'y_deriv_y', which the decaying moving average - // of the product of the output times (the derivative of the - // objective w.r.t. the output); or zero if backward_count_ - // is zero. - // - // The quantity below is derived from the stats above. 
- // - // data_.Row(4) is 'scale', which is the inverse square root of the - // covariance computed from x_mean and x_uvar (plus epsilon), - // or zero if stats_count_ is zero. -}; - - - - } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index da14c188244..7d84c2e9518 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -4981,12 +4981,6 @@ void CompositeComponent::Init(const std::vector &components, max_rows_process_ = max_rows_process; for (size_t i = 0; i < components_.size(); i++) { - if (components_[i]->Type() == "MemoryNormComponent") { - // This is out of concerns about the fact that the stats accumulation - // is done in the backprop, not in the forward propagation. - KALDI_ERR << "MemoryNormComponent cannot currently exist inside " - "CompositeComponent"; - } // make sure all constituent components are simple. KALDI_ASSERT(components_[i]->Properties() & kSimpleComponent); if (i > 0) { diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 781b4e558e9..472a02197e5 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1675,11 +1675,11 @@ static void GenerateRandomComponentConfig(std::string *component_type, // labels to the most recently added component, so it gets tested more case 31: { *component_type = "BatchNormComponent"; - int32 block_dim = RandInt(1, 10), dim = block_dim * RandInt(1, 2); + int32 block_dim = RandInt(1, 20), dim = block_dim * RandInt(1, 2); bool test_mode = (RandInt(0, 1) == 0); os << " dim=" << dim << " block-dim=" << block_dim << " target-rms=" - << RandInt(1, 2) << " test-mode=" + << RandInt(1, 4) << " test-mode=" << (test_mode ? "true" : "false") << " power=" << (-0.1 * RandInt(3, 5)) << " epsilon=" << (RandInt(0, 1) == 0 ? 
"0.1" : "1.0"); diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 2a92e1f5a44..488a711e09d 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -492,8 +492,7 @@ void SetDropoutProportion(BaseFloat dropout_proportion, bool HasBatchnorm(const Nnet &nnet) { for (int32 c = 0; c < nnet.NumComponents(); c++) { const Component *comp = nnet.GetComponent(c); - if (dynamic_cast(comp) != NULL || - dynamic_cast(comp) != NULL) + if (dynamic_cast(comp) != NULL) return true; } return false; @@ -509,9 +508,6 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, BatchNormComponent *bc = dynamic_cast(comp); if (bc != NULL) bc->Scale(batchnorm_stats_scale); - MemoryNormComponent *mc = dynamic_cast(comp); - if (mc != NULL) - mc->Scale(batchnorm_stats_scale); } } @@ -536,9 +532,6 @@ void SetBatchnormTestMode(bool test_mode, Nnet *nnet) { BatchNormComponent *bc = dynamic_cast(comp); if (bc != NULL) bc->SetTestMode(test_mode); - MemoryNormComponent *mc = dynamic_cast(comp); - if (mc != NULL) - mc->SetTestMode(test_mode); } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index 6f9b6cb959f..fc1631a8d77 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -168,12 +168,10 @@ std::string NnetInfo(const Nnet &nnet); void SetDropoutProportion(BaseFloat dropout_proportion, Nnet *nnet); -/// Returns true if nnet has at least one component of type -/// BatchNormComponent or MemoryNormComponent +/// Returns true if nnet has at least one component of type BatchNormComponent. bool HasBatchnorm(const Nnet &nnet); -/// This function affects only components of type BatchNormComponent or -/// MemoryNormComponent. +/// This function affects only components of type BatchNormComponent. /// It sets "test mode" on such components (if you call it with test_mode = /// true, otherwise it would set normal mode, but this wouldn't be needed /// often). 
"test mode" means that instead of using statistics from the batch, @@ -445,8 +443,8 @@ void ApplyL2Regularization(const Nnet &nnet, /** This function scales the batchorm stats of any batchnorm components - (components of type BatchNormComponent or MemoryNormComponent) in 'nnet' by - the scale 'batchnorm_stats_scale'. + (components of type BatchNormComponent) in 'nnet' by the scale + 'batchnorm_stats_scale'. */ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, Nnet *nnet); From f654281ef57952b1a37a9fc48ce45ef7ac7e6cec Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 29 Dec 2017 20:44:32 -0800 Subject: [PATCH 040/184] [src,scripts] Removing diagonal extension to natural gradient code (did not seem helpful) --- .../steps/libs/nnet3/xconfig/basic_layers.py | 5 +- src/nnet3/natural-gradient-online.cc | 137 +----------------- src/nnet3/natural-gradient-online.h | 112 -------------- src/nnet3/nnet-simple-component.cc | 61 +------- src/nnet3/nnet-simple-component.h | 25 ---- 5 files changed, 9 insertions(+), 331 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 6393709d82e..c59e4a6041e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -686,8 +686,6 @@ def set_default_configs(self): 'bias-stddev': '', 'l2-regularize': '', 'learning-rate-factor': '', - 'diagonal-power-in': '', - 'diagonal-power-out': '', 'max-change': 0.75 } def check_configs(self): @@ -760,8 +758,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): affine_options = self.config['ng-affine-options'] for opt_name in [ 'max-change', 'learning-rate-factor', - 'bias-stddev', 'l2-regularize', - 'diagonal-power-in', 'diagonal-power-out' ]: + 'bias-stddev', 'l2-regularize' ]: value = self.config[opt_name] if value != '': affine_options += ' {0}={1}'.format(opt_name, value) diff --git a/src/nnet3/natural-gradient-online.cc 
b/src/nnet3/natural-gradient-online.cc index 83702626f5f..19a7d5fafdc 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -28,8 +28,7 @@ OnlineNaturalGradient::OnlineNaturalGradient(): rank_(40), update_period_(1), num_samples_history_(2000.0), num_minibatches_history_(0.0), alpha_(4.0), epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0), - self_debug_(false), - diagonal_power_(0.0), diagonal_epsilon_(1.0e-03) { } + self_debug_(false) { } /** @@ -182,14 +181,10 @@ void OnlineNaturalGradient::PreconditionDirections( bool updating = Updating(); BaseFloat initial_product; - if (diagonal_power_ == 0.0 || scale != NULL) - initial_product = TraceMatMat(*X_t, *X_t, kTrans); + initial_product = TraceMatMat(*X_t, *X_t, kTrans); - if (diagonal_power_ == 0.0) - PreconditionDirectionsInternal(rho_t, initial_product, - updating, d_t, &WJKL_t, X_t); - else - PreconditionDirectionsDiagonal(rho_t, updating, d_t, &WJKL_t, X_t); + PreconditionDirectionsInternal(rho_t, initial_product, + updating, d_t, &WJKL_t, X_t); if (scale) { if (initial_product <= 0.0) { @@ -202,123 +197,6 @@ void OnlineNaturalGradient::PreconditionDirections( t_ += 1; } -void OnlineNaturalGradient::PreconditionDirectionsDiagonal( - const BaseFloat rho_t, - bool updating, - const Vector &d_t, - CuMatrixBase *WJKL_t, - CuMatrixBase *X_t) { - KALDI_ASSERT(diagonal_power_ > 0.0 && diagonal_power_ <= 1.0 && - (diagonal_mean_.Dim() != 0 || updating)); - - int32 dim = X_t->NumCols(); - - if (diagonal_mean_.Dim() == 0) { - InitDiagonalParams(*X_t); - updating = false; - } - - CuVector new_diagonal_mean, new_diagonal_uvar; - - if (updating) { - new_diagonal_mean.Resize(dim, kUndefined); - new_diagonal_uvar.Resize(dim, kUndefined); - UpdateDiagonalStats(*X_t, &new_diagonal_mean, &new_diagonal_uvar); - } - - X_t->MulColsVec(diagonal_scale_); - - PreconditionDirectionsInternal(rho_t, TraceMatMat(*X_t, *X_t, kTrans), false, - d_t, WJKL_t, X_t); - - // We apply the scale 
both before and after the identity-plus-low-rank matrix, - // so that the combined matrix is symmetric. - X_t->MulColsVec(diagonal_scale_); - - - // If we're updating the diagonal mean and variance we do so *after* - // preconditioning the data. This is out of a concern about the provability - // of convergence (making it independent of the current minibatch). Most - // likely, in practice it would work fine updating it before, it might even be - // a little bit more stable. Anyway, this is how we're doing it, and it's how - // we did it for the core part of the natural gradient. - if (updating) { - diagonal_mean_.Swap(&new_diagonal_mean); - diagonal_uvar_.Swap(&new_diagonal_uvar); - UpdateDiagonalScale(); - } -} - -void OnlineNaturalGradient::UpdateDiagonalStats( - const CuMatrixBase &X, - CuVectorBase *diagonal_mean_new, - CuVectorBase *diagonal_uvar_new){ - int32 dim = X.NumCols(), num_rows = X.NumRows(); - KALDI_ASSERT(diagonal_mean_new->Dim() == dim && diagonal_uvar_new->Dim() == dim && - diagonal_mean_.Dim() == dim); - BaseFloat eta = Eta(X.NumRows()); - // 'eta' is a value that reflects how fast we update these stats, which is - // smaller if we're updating them slower, but strictly less than 1. It's - // basically the scale on the new stats, with 1-eta being the scale on the old - // stats. 
- KALDI_ASSERT(eta > 0 && eta < 1.0); - - diagonal_mean_new->CopyFromVec(diagonal_mean_); - diagonal_uvar_new->CopyFromVec(diagonal_uvar_); - - diagonal_mean_new->AddRowSumMat(eta / num_rows, X, 1.0 - eta); - diagonal_uvar_new->AddDiagMat2(eta / num_rows, X, kTrans, 1.0 - eta); -} - -void OnlineNaturalGradient::InitDiagonalParams( - const CuMatrixBase &X) { - int32 dim = X.NumCols(), num_rows = X.NumRows(); - diagonal_mean_.Resize(dim); - diagonal_uvar_.Resize(dim); - diagonal_mean_.AddRowSumMat(1.0 / num_rows, X, 0.0); - diagonal_uvar_.AddDiagMat2(1.0 / num_rows, X, kTrans, 0.0); - UpdateDiagonalScale(); -} - - -void OnlineNaturalGradient::UpdateDiagonalScale() { - KALDI_ASSERT(diagonal_mean_.Dim() != 0); - int32 dim = diagonal_mean_.Dim(); - if (diagonal_scale_.Dim() != dim) - diagonal_scale_.Resize(dim); - diagonal_scale_.CopyFromVec(diagonal_uvar_); - // Because the last element may be the offset, and it doesn't - // make sense to subtract its mean for this purpose, only do this for - // the all-but-last-elements. - if (dim > 1) { - CuSubVector diagonal_scale_part(diagonal_scale_, 0, dim - 1), - diagonal_mean_part(diagonal_mean_, 0, dim - 1); - diagonal_scale_part.AddVecVec(-1.0, diagonal_mean_part, - diagonal_mean_part, 1.0); - } - - // At this point, diagonal_scale_ is the diagonal of the (centered) variance - // estimated from the x and x2 statistics, prior to any flooring or - // scaling. - BaseFloat avg_variance = diagonal_scale_.Sum() / dim; - if (avg_variance <= 1.0e-20) { - // either the data is all zero or very tiny, or something went wrong. Just - // set diagonal_scale_ to a constant. - diagonal_scale_.Set(1.0); - } else { - BaseFloat floor = diagonal_epsilon_ * avg_variance; - diagonal_scale_.ApplyFloor(floor); - // The following statement scales diagonal_scale_ so its average is close to - // 1, which helps keep things in a reasonable numeric range. 
There is no - // reason why it has to be exactly one, and the whole thing is mathematically - // invariant to this scaling factor-- we output the scaling factor 'scale' - // from PreconditionDirections() so that the user can rescale so the vector - // 2-norm of the X_t matrix is the same as was before the natural gradent. - diagonal_scale_.Scale(1.0 / avg_variance); - diagonal_scale_.ApplyPow(-0.5 * diagonal_power_); - } -} - void OnlineNaturalGradient::ReorthogonalizeXt1( const VectorBase &d_t1, BaseFloat rho_t1, @@ -701,12 +579,7 @@ OnlineNaturalGradient::OnlineNaturalGradient(const OnlineNaturalGradient &other) alpha_(other.alpha_), epsilon_(other.epsilon_), delta_(other.delta_), frozen_(other.frozen_), t_(other.t_), self_debug_(other.self_debug_), W_t_(other.W_t_), - rho_t_(other.rho_t_), d_t_(other.d_t_), - diagonal_power_(other.diagonal_power_), - diagonal_epsilon_(other.diagonal_epsilon_), - diagonal_mean_(other.diagonal_mean_), - diagonal_uvar_(other.diagonal_uvar_), - diagonal_scale_(other.diagonal_scale_) { } + rho_t_(other.rho_t_), d_t_(other.d_t_) { } OnlineNaturalGradient& OnlineNaturalGradient::operator = ( diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index f2713063492..0b05948977e 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -411,43 +411,6 @@ namespace nnet3 { is that this isn't going to be a problem. */ -/** - DIAGONAL_EXTENSION - - This comment explains the diagonal extension to the natural gradient method (this - was not described in the original paper). - - Physically this diagonal scaling happens both before and after the main natural - gradient code. I.e. the main natural gradient code (which makes use - of a scaled-unit-plus-low-rank factorization), happens inside the - space where we've applied the diagonal component of the preconditioning, - so the overall natural-gradient matrix is of the form: - diag scaled-unit-plus-low-rank diag. 
- The way this is estimated only really makes sense if diagonal_power_ - is either zero or one, but I expect that for in-between values it will - work fine in practice. - - The way the diagonal scaling factor is estimated is that we accumulate mean - and variance stats for each dimension (decaying over time like the previous - natural gradient stats), and set the scaling factor to some power of the - variance estimated this way. The power of the variance used to get the - scaling factor is actually -0.5 times diagonal_power_, the factor of 0.5 - being required because the scaling is applied twice, both before and after - the scaled-unit-plus-low-rank inverse-Fisher matrix, to preserve symmetry. - - It may seem odd that we are taking into account the mean here, while - conceptually it's the uncentered covariance of the vectors that we're - modeling. The reason is that any offset in the vectors we're modeling - can be taken into account by one of the eigenvectors of the low-rank - matrix, so we anticipate that taking the mean out of consideration will - tend to give us a better factorization. This is all a litte bit ad-hoc. - It would be cleaner to formulate this whole thing as learning a factored - representation of the inverse Fisher matrix, but that would become - very complicated, so we just estimate the diagonal in this rather ad-hoc - way and then do the low-rank factorization of the Fisher matrix after - the diagonal preconditioning. - */ - class OnlineNaturalGradient { public: @@ -471,11 +434,6 @@ class OnlineNaturalGradient { int32 GetRank() const { return rank_; } int32 GetUpdatePeriod() const { return update_period_; } - // search above for DIAGONAL_EXTENSION for explanations. Value should - // be between 0 and 1. - void SetDiagonalPower(BaseFloat p) { diagonal_power_ = p; } - BaseFloat GetDiagonalPower() const { return diagonal_power_; } - // see comment where 'frozen_' is declared. 
inline void Freeze(bool frozen) { frozen_ = frozen; } @@ -521,16 +479,6 @@ class OnlineNaturalGradient { CuMatrixBase *WJKL_t, CuMatrixBase *X_t); - // This function is called from PreconditionDirections(), only if - // diagonal_power_ != 0.0 (see comment starting DIAGONAL_EXTENSION above). - // It takes care of the diagonal factors in the Fisher-matrix estimate - // and recurses to PreconditionDirectionsInternal(). - void PreconditionDirectionsDiagonal(const BaseFloat rho_t, - bool updating, - const Vector &d_t, - CuMatrixBase *WJKL_t, - CuMatrixBase *X_t); - // Works out from t_ and various class variables whether we will update // the parameters on this iteration (returns true if so). @@ -597,29 +545,6 @@ class OnlineNaturalGradient { // properties. void SelfTest() const; - - // This function, called only if diagonal_power_ != 0.0 (see - // DIAGONAL_EXTENSION comment), initializes diagonal_mean_, diagonal_uvar_ and - // diagonal_scale_, with stats from this minibatch (X is the vectors before - // preconditioning, one vector per row). - void InitDiagonalParams(const CuMatrixBase &X); - - // This function, called only if diagonal_power_ != 0.0 (see - // DIAGONAL_EXTENSION comment), sets diagonal_mean_new and diagonal_uvar_new to - // updated versions of the diagonal stats in diagonal_mean_ and diagonal_uvar_: - // changed by scaling down the old stats and then adding in stats from 'X'. - // 'X' is the vectors (one per row) that are doing to multiplied by our - // natural gradient matrix. The provided pointers will be pointers to - // temporaries that will later be copied to class members. - void UpdateDiagonalStats(const CuMatrixBase &X, - CuVectorBase *diagonal_mean_new, - CuVectorBase *diagonal_uvar_new); - - // This function updates diagonal_scale_ from the stats in - // diagonal_mean_ and diagonal_uvar_. - void UpdateDiagonalScale(); - - // Configuration values: // The rank of the correction to the unit matrix (e.g. 20). 
@@ -685,43 +610,6 @@ class OnlineNaturalGradient { CuMatrix W_t_; BaseFloat rho_t_; Vector d_t_; - - // Things below this point relate to 'diagonal' preconditioning. - // Search above for DIAGONAL_EXTENSION for an in-depth explanation. - - // The diagonal extension is turned off by default (diagonal_power_ == 0.0), - // but you can turn it on by setting diagonal_power_ (probably to some - // positive value not greater than 1, with 1 corresponding to natural - // gradient, and 0.5 corresponding to something more like Adagrad). - BaseFloat diagonal_power_; - - // diagonal_epsilon_ (e.g. 0.001) is a floor on the diagonal elements of the - // variances; this is expressed relative to the average un-floored variance - // over all dimensions (since dynamic ranges differ considerably). - BaseFloat diagonal_epsilon_; - - // dim_ is not a real variable but it is useful for explaining some things - // we're doing below. It's the dimension of the vectors we're preconditioning: - // D in the math and the paper. Is is the same as W_t_.NumCols(). - // int32 dim_; - - // diagonal_mean_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a - // moving-average mean of the vectors we're preconditioning. - CuVector diagonal_mean_; - - // diagonal_xuvar_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a - // decaying average over minibatches of the (diagonal) uncentered variance of - // the input vectors we're preconditioning. - CuVector diagonal_uvar_; - - // diagonal_scale_, of dimension dim_ (or zero if diagonal_power_ == 0.0), is a - // vector of scaling factors which is the diagonal part of the inverse-Fisher - // matrix, applied before and after the scaled-unit-plus-low-rank part. - // It is the (floored and rescaled) variance estimated from the stats in - // diagonal_mean_ and diagonal_uvar_, taken to the power -0.5 * diagonal_power_. 
- CuVector diagonal_scale_; - - }; } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 7d84c2e9518..34d24a39f24 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2657,14 +2657,6 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_out); - if (PeekToken(is, binary) == 'D') { - ExpectToken(is, binary, ""); - BaseFloat d_in, d_out; - ReadBasicType(is, binary, &d_in); - ReadBasicType(is, binary, &d_out); - preconditioner_in_.SetDiagonalPower(d_in); - preconditioner_out_.SetDiagonalPower(d_out); - } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); @@ -2773,9 +2765,7 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { // Set natural-gradient configs. BaseFloat num_samples_history = 2000.0, - alpha = 4.0, - diagonal_power_in = 0.0, - diagonal_power_out = 0.0; + alpha = 4.0; int32 rank_in = 20, rank_out = 80, update_period = 4; cfl->GetValue("num-samples-history", &num_samples_history); @@ -2783,8 +2773,6 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("rank-in", &rank_in); cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); - cfl->GetValue("diagonal-power-in", &diagonal_power_in); - cfl->GetValue("diagonal-power-out", &diagonal_power_out); preconditioner_in_.SetNumSamplesHistory(num_samples_history); preconditioner_out_.SetNumSamplesHistory(num_samples_history); @@ -2793,8 +2781,6 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_in_.SetRank(rank_in); preconditioner_out_.SetRank(rank_out); preconditioner_out_.SetUpdatePeriod(update_period); - preconditioner_in_.SetDiagonalPower(diagonal_power_in); - preconditioner_out_.SetDiagonalPower(diagonal_power_out); if 
(cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " @@ -2814,13 +2800,6 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, WriteBasicType(os, binary, preconditioner_in_.GetRank()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_out_.GetRank()); - BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), - d_out = preconditioner_out_.GetDiagonalPower(); - if (d_in != 0.0 || d_out != 0.0) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, d_in); - WriteBasicType(os, binary, d_out); - } WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); @@ -2838,12 +2817,6 @@ std::string NaturalGradientAffineComponent::Info() const { << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() << ", update-period=" << preconditioner_in_.GetUpdatePeriod() << ", alpha=" << preconditioner_in_.GetAlpha(); - BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), - d_out = preconditioner_out_.GetDiagonalPower(); - if (d_in != 0.0 || d_out != 0.0) { - stream << ", diagonal-power-in=" << d_in - << ", diagonal-power-out=" << d_out; - } return stream.str(); } @@ -2948,14 +2921,6 @@ void LinearComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_in); ReadBasicType(is, binary, &rank_out); - if (PeekToken(is, binary) == 'D') { - ExpectToken(is, binary, ""); - BaseFloat d_in, d_out; - ReadBasicType(is, binary, &d_in); - ReadBasicType(is, binary, &d_out); - preconditioner_in_.SetDiagonalPower(d_in); - preconditioner_out_.SetDiagonalPower(d_out); - } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &alpha); ExpectToken(is, binary, ""); @@ -3007,9 +2972,7 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { // Read various natural-gradient-related configs. 
int32 rank_in = 20, rank_out = 80, update_period = 4; BaseFloat alpha = 4.0, - num_samples_history = 2000.0, - diagonal_power_in = 0.0, - diagonal_power_out = 0.0; + num_samples_history = 2000.0; use_natural_gradient_ = true; @@ -3019,9 +2982,6 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { cfl->GetValue("rank-out", &rank_out); cfl->GetValue("update-period", &update_period); cfl->GetValue("use-natural-gradient", &use_natural_gradient_); - cfl->GetValue("diagonal-power-in", &diagonal_power_in); - cfl->GetValue("diagonal-power-out", &diagonal_power_out); - preconditioner_in_.SetAlpha(alpha); preconditioner_out_.SetAlpha(alpha); @@ -3031,8 +2991,6 @@ void LinearComponent::InitFromConfig(ConfigLine *cfl) { preconditioner_out_.SetNumSamplesHistory(num_samples_history); preconditioner_in_.SetUpdatePeriod(update_period); preconditioner_out_.SetUpdatePeriod(update_period); - preconditioner_in_.SetDiagonalPower(diagonal_power_in); - preconditioner_out_.SetDiagonalPower(diagonal_power_out); orthonormal_constraint_ = 0.0; cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); @@ -3059,17 +3017,10 @@ void LinearComponent::Write(std::ostream &os, rank_out = preconditioner_out_.GetRank(), update_period = preconditioner_in_.GetUpdatePeriod(); BaseFloat alpha = preconditioner_in_.GetAlpha(), - num_samples_history = preconditioner_in_.GetNumSamplesHistory(), - d_in = preconditioner_in_.GetDiagonalPower(), - d_out = preconditioner_out_.GetDiagonalPower(); + num_samples_history = preconditioner_in_.GetNumSamplesHistory(); WriteToken(os, binary, ""); WriteBasicType(os, binary, rank_in); WriteBasicType(os, binary, rank_out); - if (d_in != 0.0 || d_out != 0.0) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, d_in); - WriteBasicType(os, binary, d_out); - } WriteToken(os, binary, ""); WriteBasicType(os, binary, alpha); WriteToken(os, binary, ""); @@ -3089,12 +3040,6 @@ std::string LinearComponent::Info() const { GetVerboseLevel() >= 2); // 
include_singular_values if (orthonormal_constraint_ != 0.0) stream << ", orthonormal-constraint=" << orthonormal_constraint_; - BaseFloat d_in = preconditioner_in_.GetDiagonalPower(), - d_out = preconditioner_out_.GetDiagonalPower(); - if (d_in != 0.0 || d_out != 0.0) { - stream << ", diagonal-power-in=" << d_in - << ", diagonal-power-out=" << d_out; - } stream << ", use-natural-gradient=" << (use_natural_gradient_ ? "true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 2432c912e75..f596ec6be75 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -773,13 +773,6 @@ class LogSoftmaxComponent: public NonlinearComponent { minibatches) we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. - diagonal-power-in, diagonal-power-out - Control a diagonal factor in the natural gradient - factorization, for the input and output spaces - respectively 0.0 = default (old-style natural - gradient), 1.0 = natural gradient with the diagonal - factors; 0.5 is more like a factorized type of - adagrad. */ class NaturalGradientAffineComponent: public AffineComponent { public: @@ -805,17 +798,6 @@ class NaturalGradientAffineComponent: public AffineComponent { NaturalGradientAffineComponent &operator= ( const NaturalGradientAffineComponent&); - // Configs for preconditioner. The input side tends to be better conditioned -> - // smaller rank needed, so make them separately configurable. - int32 rank_in_; - int32 rank_out_; - int32 update_period_; - BaseFloat num_samples_history_; - BaseFloat alpha_; - // note: the config values diagonal-power-in and diagonal-power-out - // are stored in the objects preconditioner_in_ and preconditioner_out_ - // directly. 
- OnlineNaturalGradient preconditioner_in_; OnlineNaturalGradient preconditioner_out_; @@ -883,13 +865,6 @@ class NaturalGradientAffineComponent: public AffineComponent { minibatches) we update the Fisher-matrix estimates; making this > 1 saves a little time in training. default=4. - diagonal-power-in, diagonal-power-out - Control a diagonal factor in the natural gradient - factorization, for the input and output spaces - respectively 0.0 = default (old-style natural - gradient), 1.0 = natural gradient with the diagonal - factors; 0.5 is more like a factorized type of - adagrad. */ class LinearComponent: public UpdatableComponent { public: From 49689a6fbfc20053e861056191b5b44465250bc7 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 30 Dec 2017 21:13:25 -0800 Subject: [PATCH 041/184] [scripts] Clarify documentation; remove unused feature. --- .../libs/nnet3/train/chain_objf/acoustic_model.py | 9 +-------- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 798f6087a51..1ec9a09b571 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -173,19 +173,13 @@ def train_new_models(dir, iter, srand, num_jobs, (" --write-cache={0}/cache.{1}".format(dir, iter + 1) if job == 1 else "")) - # For the first epoch (at most the first 15 iters), scale the batchnorm stats - # down more aggressively. This affects memory-norm components. 
- batchnorm_opt=("--batchnorm-stats-scale=0.5" - if num_archives_processed < (num_archives * frame_subsampling_factor) and iter < 15 - else "") - thread = common_lib.background_command( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ --apply-deriv-weights={app_deriv_wts} \ --l2-regularize={l2} --leaky-hmm-coefficient={leaky} \ {cache_io_opts} --xent-regularize={xent_reg} \ - {deriv_time_opts} {batchnorm_opt} \ + {deriv_time_opts} \ --print-interval=10 --momentum={momentum} \ --max-param-change={max_param_change} \ --backstitch-training-scale={backstitch_training_scale} \ @@ -205,7 +199,6 @@ def train_new_models(dir, iter, srand, num_jobs, dir=dir, iter=iter, srand=iter + srand, next_iter=iter + 1, job=job, deriv_time_opts=" ".join(deriv_time_opts), - batchnorm_opt=batchnorm_opt, app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index d226a891113..e66d38a3dc4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -778,10 +778,17 @@ def _generate_lstm_config(self): # This class is for lines like # 'lstmb-layer name=lstm1 input=[-1] delay=-3' # -# TODO: more description -# It's like fast-lstm-layer but with a bottleneck (like an SVD) in the main parameter matrix -# of the LSTM (W_all, which combines all the full-rank projections of the LSTM): we divide -# it into two matrices, with an orbatch-norm in between to stabilize the training. +# LSTMB is not something we've published; it's LSTM with a bottleneck in the +# middle of the W_all matrix (where W_all is a matrix that combines the 8 full +# matrices of standard LSTM). W_all is factored into W_all_a and W_all_b, where +# W_all_a is constrained to have orthonormal rows (this keeps it training stably). 
+# +# It also contains a couple of other improvements: W_all_b is followed by +# trainable ScaleAndOffsetComponent (this is a bit like the idea from the +# publication "Self-stabilized deep neural network" by Ghahramani et al). +# And the LSTM is followed by a batchnorm component (this is by default; it's not +# part of the layer name, like lstmb-batchnorm-layer). + # # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. From c362b2cf4bfbe51282645a472f04a76e8ee475f5 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 31 Dec 2017 19:35:15 -0800 Subject: [PATCH 042/184] [src] Add more diagnostics for NonlinearComponent --- src/nnet3/nnet-component-itf.cc | 36 ++++++++++++++++++++++++++++----- src/nnet3/nnet-component-itf.h | 5 +++++ src/nnet3/nnet-parse.cc | 5 +++++ src/nnet3/nnet-parse.h | 2 ++ 4 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 8a52b7b788c..d0319403b10 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -393,11 +393,16 @@ std::string NonlinearComponent::Info() const { value_avg.Scale(1.0 / count_); stream << ", value-avg=" << SummarizeVector(value_avg); if (deriv_sum_.Dim() == dim_) { - Vector deriv_avg_dbl(deriv_sum_); - Vector deriv_avg(deriv_avg_dbl); + Vector deriv_avg(deriv_sum_); deriv_avg.Scale(1.0 / count_); stream << ", deriv-avg=" << SummarizeVector(deriv_avg); } + if (oderiv_sumsq_.Dim() == dim_) { + Vector oderiv_rms(oderiv_sumsq_); + oderiv_rms.Scale(1.0 / count_); + oderiv_rms.ApplyPow(0.5); + stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms); + } } return stream.str(); } @@ -405,6 +410,7 @@ std::string NonlinearComponent::Info() const { void NonlinearComponent::Scale(BaseFloat scale) { value_sum_.Scale(scale); deriv_sum_.Scale(scale); + oderiv_sumsq_.Scale(scale); count_ *= scale; num_dims_self_repaired_ *= scale; 
num_dims_processed_ *= scale; @@ -418,10 +424,14 @@ void NonlinearComponent::Add(BaseFloat alpha, const Component &other_in) { value_sum_.Resize(other->value_sum_.Dim()); if (deriv_sum_.Dim() == 0 && other->deriv_sum_.Dim() != 0) deriv_sum_.Resize(other->deriv_sum_.Dim()); + if (oderiv_sumsq_.Dim() == 0 && other->oderiv_sumsq_.Dim() != 0) + oderiv_sumsq_.Resize(other->oderiv_sumsq_.Dim()); if (other->value_sum_.Dim() != 0) value_sum_.AddVec(alpha, other->value_sum_); if (other->deriv_sum_.Dim() != 0) deriv_sum_.AddVec(alpha, other->deriv_sum_); + if (other->oderiv_sumsq_.Dim() != 0) + oderiv_sumsq_.AddVec(alpha, other->oderiv_sumsq_); count_ += alpha * other->count_; num_dims_self_repaired_ += alpha * other->num_dims_self_repaired_; num_dims_processed_ += alpha * other->num_dims_processed_; @@ -443,10 +453,18 @@ void NonlinearComponent::Read(std::istream &is, bool binary) { value_sum_.Read(is, binary); ExpectToken(is, binary, ""); deriv_sum_.Read(is, binary); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + oderiv_sumsq_.Read(is, binary); + oderiv_sumsq_.ApplyPow(2.0); + } else { + oderiv_sumsq_.Resize(deriv_sum_.Dim()); + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &count_); value_sum_.Scale(count_); deriv_sum_.Scale(count_); + oderiv_sumsq_.Scale(count_); std::string token; ReadToken(is, binary, &token); @@ -493,12 +511,20 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const { Vector temp(value_sum_); if (count_ != 0.0) temp.Scale(1.0 / count_); temp.Write(os, binary); - WriteToken(os, binary, ""); - temp.Resize(deriv_sum_.Dim(), kUndefined); + WriteToken(os, binary, ""); + temp.Resize(deriv_sum_.Dim()); temp.CopyFromVec(deriv_sum_); if (count_ != 0.0) temp.Scale(1.0 / count_); temp.Write(os, binary); + + WriteToken(os, binary, ""); + temp.Resize(oderiv_sumsq_.Dim()); + temp.CopyFromVec(oderiv_sumsq_); + if (count_ != 0.0) temp.Scale(1.0 / count_); + temp.ApplyPow(0.5); + temp.Write(os, binary); + 
WriteToken(os, binary, ""); WriteBasicType(os, binary, count_); WriteToken(os, binary, ""); @@ -530,7 +556,7 @@ NonlinearComponent::NonlinearComponent(): NonlinearComponent::NonlinearComponent(const NonlinearComponent &other): dim_(other.dim_), block_dim_(other.block_dim_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_), - count_(other.count_), + oderiv_sumsq_(other.oderiv_sumsq_), count_(other.count_), num_dims_self_repaired_(other.num_dims_self_repaired_), num_dims_processed_(other.num_dims_processed_), self_repair_lower_threshold_(other.self_repair_lower_threshold_), diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 565a7f25e74..c096a78325b 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -658,6 +658,11 @@ class NonlinearComponent: public Component { CuVector deriv_sum_; // stats of the derivative of the nonlinearity // (only applicable to element-by-element // nonlinearities, not Softmax. + CuVector oderiv_sumsq_; // Sum-square of the derivative of the + // objective function, that we're propagating + // back. Accumulated during the backprop; + // used for diagnostics. + double count_; // some stats for self-repairing nonlinearities. 
diff --git a/src/nnet3/nnet-parse.cc b/src/nnet3/nnet-parse.cc index 52d47876c8a..bb3a209460a 100644 --- a/src/nnet3/nnet-parse.cc +++ b/src/nnet3/nnet-parse.cc @@ -517,6 +517,11 @@ std::string SummarizeVector(const VectorBase &vec) { return os.str(); } +std::string SummarizeVector(const VectorBase &vec) { + Vector vec_copy(vec); + return SummarizeVector(vec_copy); +} + std::string SummarizeVector(const CuVectorBase &cu_vec) { Vector vec(cu_vec); return SummarizeVector(vec); diff --git a/src/nnet3/nnet-parse.h b/src/nnet3/nnet-parse.h index 5cfe080e422..0b2e0041aaa 100644 --- a/src/nnet3/nnet-parse.h +++ b/src/nnet3/nnet-parse.h @@ -196,6 +196,8 @@ std::string ErrorContext(const std::string &str); */ std::string SummarizeVector(const VectorBase &vec); +std::string SummarizeVector(const VectorBase &vec); + std::string SummarizeVector(const CuVectorBase &vec); /** Print to 'os' some information about the mean and standard deviation of From b5ad6ec6ef1f7feded82115c922431e9dee0355b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 1 Jan 2018 01:55:02 -0500 Subject: [PATCH 043/184] [src] nnet3 bug fixes RE oderiv-rms stats --- .../s5/steps/libs/nnet3/report/log_parse.py | 4 +- .../nnet3/train/chain_objf/acoustic_model.py | 4 +- egs/wsj/s5/steps/libs/nnet3/train/common.py | 2 +- .../nnet3/train/frame_level_objf/common.py | 10 ++- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 56 ++++++++----- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 3 +- .../libs/nnet3/xconfig/trivial_layers.py | 53 ++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py | 50 ++++++++++- egs/wsj/s5/steps/nnet3/chain/train.py | 5 +- src/cudamatrix/cu-allocator.h | 2 +- src/nnet3/nnet-component-itf.cc | 83 ++++++++++++------- src/nnet3/nnet-component-itf.h | 16 ++-- src/nnet3/nnet-component-test.cc | 6 +- 13 files changed, 215 insertions(+), 79 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 
d5f2575d582..905edc1a78b 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -388,8 +388,8 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): " key {k} in both {tl} and {vl}".format( k=key, tl=train_prob_files, vl=valid_prob_files)) iters.sort() - return map(lambda x: (int(x), float(train_objf[x]), - float(valid_objf[x])), iters) + return list(map(lambda x: (int(x), float(train_objf[x]), + float(valid_objf[x])), iters)) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 1ec9a09b571..c63901367d6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -387,8 +387,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1))) common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 2b4fdd92cec..6c194a2c0a1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -531,7 +531,7 @@ def smooth_presoftmax_prior_scale_vector(pdf_counts, scales.append(math.pow(pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power)) num_pdfs = len(pdf_counts) - scaled_counts = map(lambda x: x * float(num_pdfs) / sum(scales), scales) + scaled_counts = list(map(lambda x: x * float(num_pdfs) / sum(scales), scales)) return scaled_counts diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py 
b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index aa100e6af91..9f9b5752ce6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -9,6 +9,8 @@ network without transition model) with frame-level objectives. """ +from __future__ import print_statement +from __future__ import division import glob import logging import math @@ -91,7 +93,7 @@ def train_new_models(dir, iter, srand, num_jobs, archive_index = (k % num_archives) + 1 if not chunk_level_training: - frame = (k / num_archives + archive_index) % frames_per_eg + frame = (k // num_archives + archive_index) % frames_per_eg cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) @@ -344,8 +346,8 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, rand_prune=rand_prune)) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), - range(1, num_lda_jobs + 1)) + lda_stat_files = list(map(lambda x: '{0}/{1}.lda_stats'.format(dir, x), + range(1, num_lda_jobs + 1))) common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ @@ -576,7 +578,7 @@ def get_realign_iters(realign_times, num_iters, + realign_time * math.pow(num_jobs_final, 2)) realign_iter = realign_iter - num_jobs_initial - realign_iter = realign_iter / (num_jobs_final - num_jobs_initial) + realign_iter = realign_iter // (num_jobs_final - num_jobs_initial) realign_iter = realign_iter * num_iters realign_iters.append(int(realign_iter)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index e66d38a3dc4..85454795435 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -609,9 +609,10 @@ def set_default_configs(self): 'clipping-threshold' : 30.0, 'zeroing-interval' : 20, 'zeroing-threshold' : 15.0, - # 
recurrence-scale is a scale we put on the c_t when doing linear projections - # from it... making it larger than 1 (e.g. 4) helps equalize scales. - 'recurrence-scale': 1.0, + # self-scale is a scale we put on the m_t when doing + # linear projections from it... making it larger than 1 + # (e.g. 4) helps equalize scales. + 'self-scale': 1.0, 'delay' : -1, # if you want to set 'self-repair-scale' (c.f. the # self-repair-scale-nonlinearity config value in older LSTM layers), you can @@ -748,7 +749,7 @@ def _generate_lstm_config(self): configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " "IfDefined(Offset(Scale({2}, {0}.c_trunc), {3})))".format( - name, input_descriptor, self.config['recurrence-scale'], delay)) + name, input_descriptor, self.config['self-scale'], delay)) if self.config['self-stabilize']: configs.append("component-node name={0}.W_all_so component={0}.W_all_so input={0}.W_all".format(name)) W_all_name = 'W_all_so' @@ -825,18 +826,23 @@ def set_default_configs(self): self.config = { 'input':'[-1]', 'cell-dim' : -1, # this is a required argument 'bottleneck-dim': -1, # this is a required argument - 'clipping-threshold' : 30.0, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, + 'clipping-threshold': 30.0, + 'zeroing-interval': 20, + 'zeroing-threshold': 15.0, + # batchnorm-power is for what i'm going to call OverNorm, you can set it + # for example to -0.75. + 'batchnorm-power': -0.5, 'delay' : -1, 'lstm-nonlinearity-options' : ' max-change=0.75', + # the recurrence scale is the scale on m_trunc, used in the + # recurrence (to balance its size with the input). + 'self-scale' : 1.0, # the affine layer contains 4 of our old layers -> use a # larger max-change than the normal value of 0.75. 'ng-affine-options' : ' max-change=1.5', 'l2-regularize': 0.0, 'decay-time': -1.0 } - self.c_needed = False # keep track of whether the 'c' output is needed. 
def set_derived_configs(self): if self.config['cell-dim'] <= 0: @@ -884,6 +890,7 @@ def _generate_lstm_config(self): input_descriptor = self.descriptors['input']['final-string'] cell_dim = self.config['cell-dim'] bottleneck_dim = self.config['bottleneck-dim'] + self_scale = self.config['self-scale'] delay = self.config['delay'] affine_str = self.config['ng-affine-options'] l2_regularize = self.config['l2-regularize'] @@ -917,7 +924,7 @@ def _generate_lstm_config(self): # constraint, it's meaningless. configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "orthonormal-constraint=1.0 output-dim={2} {3} ".format( + "orthonormal-constraint=1.0 output-dim={2} {3}".format( name, input_dim + cell_dim, bottleneck_dim, affine_str)) @@ -936,16 +943,15 @@ def _generate_lstm_config(self): l2_regularize_option)) configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.c_trunc type=BackpropTruncationComponent dim={1} {2}".format( - name, cell_dim, bptrunc_str)) - configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( - name, cell_dim)) - + configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, 2 * cell_dim, bptrunc_str)) + configs.append("component name={0}.m_batchnorm type=BatchNormComponent power={1} dim={2} ".format( + name, self.config['batchnorm-power'], cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset({0}.c_trunc, {2})))".format( - name, input_descriptor, delay)) + "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( + name, input_descriptor, self_scale, delay)) configs.append("component-node name={0}.W_all_b component={0}.W_all_b " "input={0}.W_all_a".format(name)) 
configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " @@ -954,11 +960,13 @@ def _generate_lstm_config(self): configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( name, delay)) - configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 " - "dim={1}".format(name, cell_dim)) configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.c_trunc component={0}.c_trunc input={0}.c".format(name)) + configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) + configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + "dim={1}".format(name, cell_dim)) + configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + "dim={1}".format(name, cell_dim)) configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " "input={0}.m".format(name)) configs.append("### End LTSM layer '{0}'".format(name)) @@ -1151,15 +1159,17 @@ def _generate_lstm_config(self): if self.config['self-stabilize']: # have LinearComponent followed by ScaleAndOffsetComponent. 
configs.append("component name={0}.W_all type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + "output-dim={2} {3} {4} ".format( + name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) configs.append("component name={0}.W_all_so type=ScaleAndOffsetComponent dim={1} " "max-change=0.75".format(name, cell_dim * 4)) else: # have NaturalGradientAffineComponent configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) + "output-dim={2} {3} {4}".format( + name, input_dim + rec_proj_dim, cell_dim * 4, + affine_str, l2_regularize_option)) configs.append("# The core LSTM nonlinearity, implemented as a single component.") configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 71205961681..c6b0619bca8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,7 +68,8 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, - 'renorm-component': xlayers.XconfigRenormComponent + 'renorm-component': xlayers.XconfigRenormComponent, + 'no-op-component': xlayers.XconfigNoOpComponent } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index 80a2b7df418..ef05887e469 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -68,3 +68,56 @@ def 
_generate_config(self): self.name, input_desc)) configs.append(line) return configs + + +class XconfigNoOpComponent(XconfigLayerBase): + """This class is for parsing lines like + 'no-op-component name=renorm input=Append(-3,0,3)' + which will produce just a single component, of type NoOpComponent. + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]' } + + def check_configs(self): + pass + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + input_dim = self.descriptors['input']['dim'] + return input_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + + configs = [] + line = ('component name={0} type=NoOpComponent dim={1}'.format( + self.name, input_dim)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py index 9ff7f1e2258..08de18167cd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/utils.py @@ -6,6 +6,7 @@ # while xconfig_layers.py contains the code specific to layer types. from __future__ import print_function +from __future__ import division import re import sys @@ -277,6 +278,12 @@ def dim(self, layer_to_dim): return self.items[0].dim(layer_to_dim) elif self.operator == 'Append': return sum([ x.dim(layer_to_dim) for x in self.items]) + elif self.operator == 'Scale': + # e.g. Scale(2.0, lstm1). Return dim of 2nd arg. + return self.items[1].dim(layer_to_dim) + elif self.operator == 'Const': + # e.g. Const(0.5, 512). Return 2nd arg, which is an int. + return self.items[1] else: raise RuntimeError("Unknown operator {0}".format(self.operator)) @@ -312,7 +319,8 @@ def parse_new_descriptor(tokens, pos, prev_names): # when reading this function, be careful to note the indent level, # there is an if-statement within an if-statement. 
- if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', + 'Switch', 'Failover', 'IfDefined' ]: expect_token('(', tokens[pos], first_token + '()') pos += 1 d.operator = first_token @@ -392,6 +400,38 @@ def parse_new_descriptor(tokens, pos, prev_names): pos += 1 else: raise RuntimeError("code error") + elif first_token in ['Scale', 'Const' ]: + # Parsing something like 'Scale(2.0, lstm1)' or 'Const(1.0, 512)' + expect_token('(', tokens[pos], first_token + '()') + pos += 1 + d.operator = first_token + # First arg of Scale() and Const() is a float: the scale or value, + # respectively. + try: + value = float(tokens[pos]) + pos += 1 + d.items = [value] + except: + raise RuntimeError("Parsing {0}, expected float, got {1}".format( + first_token, tokens[pos])) + # Consume the comma. + expect_token(',', tokens[pos], first_token + '()') + pos += 1 + if first_token == 'Scale': + # Second arg of Scale() is a Descriptor. 
+ (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) + d.items.append(desc) + else: + assert first_token == 'Const' + try: + dim = int(tokens[pos]) + pos += 1 + d.items.append(dim) + except: + raise RuntimeError("Parsing Const() expression, expected int, got {0}".format( + tokens[pos])) + expect_token(')', tokens[pos], first_token) + pos += 1 elif first_token in [ 'end of string', '(', ')', ',', '@' ]: raise RuntimeError("Expected descriptor, got " + first_token) elif is_valid_line_name(first_token) or first_token == '[': @@ -555,7 +595,7 @@ def parse_config_line(orig_config_line): rest_of_line = ' '.join(fields) # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' - positions = map(lambda x: x.start(), re.finditer('"', rest_of_line)) + positions = list(map(lambda x: x.start(), re.finditer('"', rest_of_line))) if not len(positions) % 2 == 0: raise RuntimeError("Double-quotes should occur in pairs") @@ -565,7 +605,7 @@ def parse_config_line(orig_config_line): # and replace the quotation marks themselves with spaces. # Then later on we'll convert all the question marks to # equals signs in the values in the dicts. 
- num_strings = len(positions) / 2 + num_strings = len(positions) // 2 fields = [] for i in range(num_strings): start = positions[i * 2] @@ -588,7 +628,7 @@ def parse_config_line(orig_config_line): if not (other_fields[0] == '' and len(other_fields) % 2 == 1): raise RuntimeError("Could not parse config line."); fields += other_fields[1:] - num_variables = len(fields) / 2 + num_variables = len(fields) // 2 for i in range(num_variables): var_name = fields[i * 2] var_value = fields[i * 2 + 1] @@ -634,6 +674,8 @@ def test_library(): ('Append(-3,0,3)', 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), ('[-1]', 'prev_layer'), + ('Scale(2.0,foo)', 'Scale(2.0, foo)'), + ('Const(0.5,500)', 'Const(0.5, 500)'), ('[-2]', 'last_but_one_layer'), ('[-2]@3', 'Offset(last_but_one_layer, 3)') ]: diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 7f607abd8dc..e52d2ecee20 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -522,7 +522,7 @@ def train(args, run_opts): backstitch_training_interval=args.backstitch_training_interval) if args.cleanup: - # do a clean up everythin but the last 2 models, under certain + # do a clean up everything but the last 2 models, under certain # conditions common_train_lib.remove_model( args.dir, iter-2, num_iters, models_to_combine, @@ -573,8 +573,9 @@ def train(args, run_opts): # delete it remove_egs = False + # leave the last-two-numbered models, for diagnostic reasons. common_train_lib.clean_nnet_dir( - args.dir, num_iters, egs_dir, + args.dir, num_iters - 1, egs_dir, preserve_model_interval=args.preserve_model_interval, remove_egs=remove_egs) diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index f2ccf0d6c29..0f96315e848 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -54,7 +54,7 @@ struct CuAllocatorOptions { // is a constant overhead proportional to the number of buckets. 
BaseFloat delete_factor; - CuAllocatorOptions(): memory_factor(1.5), + CuAllocatorOptions(): memory_factor(1.3), delete_factor(0.001) { } void Check() { diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index d0319403b10..e2a316835fd 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -332,24 +332,23 @@ std::string UpdatableComponent::Info() const { void NonlinearComponent::StoreStatsInternal( const CuMatrixBase &out_value, const CuMatrixBase *deriv) { - KALDI_ASSERT(out_value.NumCols() == InputDim()); + KALDI_ASSERT(out_value.NumCols() == dim_); // Check we have the correct dimensions. - if (value_sum_.Dim() != InputDim() || - (deriv != NULL && deriv_sum_.Dim() != InputDim())) { - std::lock_guard lock(mutex_); - if (value_sum_.Dim() != InputDim()) { - value_sum_.Resize(InputDim()); + if (value_sum_.Dim() != dim_ || + (deriv != NULL && deriv_sum_.Dim() != dim_)) { + if (value_sum_.Dim() != dim_) { + value_sum_.Resize(dim_); count_ = 0.0; } - if (deriv != NULL && deriv_sum_.Dim() != InputDim()) { - deriv_sum_.Resize(InputDim()); + if (deriv != NULL && deriv_sum_.Dim() != dim_) { + deriv_sum_.Resize(dim_); count_ = 0.0; value_sum_.SetZero(); } } count_ += out_value.NumRows(); - CuVector temp(InputDim()); + CuVector temp(dim_); temp.AddRowSumMat(1.0, out_value, 0.0); value_sum_.AddVec(1.0, temp); if (deriv != NULL) { @@ -358,22 +357,35 @@ void NonlinearComponent::StoreStatsInternal( } } +void NonlinearComponent::StoreBackpropStats( + const CuMatrixBase &out_deriv) { + KALDI_ASSERT(out_deriv.NumCols() == dim_); + + // Check we have the correct dimensions. 
+ if (oderiv_sumsq_.Dim() != dim_) { + oderiv_sumsq_.Resize(dim_); + oderiv_count_ = 0.0; + } + CuVector temp(dim_); + temp.AddDiagMat2(1.0, out_deriv, kTrans, 0.0); + oderiv_sumsq_.AddVec(1.0, temp); + oderiv_count_ += out_deriv.NumRows(); +} + + void NonlinearComponent::ZeroStats() { value_sum_.SetZero(); deriv_sum_.SetZero(); + oderiv_sumsq_.SetZero(); count_ = 0.0; + oderiv_count_ = 0.0; num_dims_self_repaired_ = 0.0; num_dims_processed_ = 0.0; } std::string NonlinearComponent::Info() const { std::stringstream stream; - if (InputDim() == OutputDim()) { - stream << Type() << ", dim=" << InputDim(); - } else { - stream << Type() << ", input-dim=" << InputDim() - << ", output-dim=" << OutputDim(); - } + stream << Type() << ", dim=" << dim_; if (block_dim_ != dim_) stream << ", block-dim=" << block_dim_; if (self_repair_lower_threshold_ != BaseFloat(kUnsetThreshold)) @@ -397,12 +409,13 @@ std::string NonlinearComponent::Info() const { deriv_avg.Scale(1.0 / count_); stream << ", deriv-avg=" << SummarizeVector(deriv_avg); } - if (oderiv_sumsq_.Dim() == dim_) { - Vector oderiv_rms(oderiv_sumsq_); - oderiv_rms.Scale(1.0 / count_); - oderiv_rms.ApplyPow(0.5); - stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms); - } + } + if (oderiv_count_ > 0 && oderiv_sumsq_.Dim() == dim_) { + Vector oderiv_rms(oderiv_sumsq_); + oderiv_rms.Scale(1.0 / oderiv_count_); + oderiv_rms.ApplyPow(0.5); + stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms) + << ", oderiv-count=" << oderiv_count_; } return stream.str(); } @@ -412,6 +425,7 @@ void NonlinearComponent::Scale(BaseFloat scale) { deriv_sum_.Scale(scale); oderiv_sumsq_.Scale(scale); count_ *= scale; + oderiv_count_ *= scale; num_dims_self_repaired_ *= scale; num_dims_processed_ *= scale; } @@ -433,6 +447,7 @@ void NonlinearComponent::Add(BaseFloat alpha, const Component &other_in) { if (other->oderiv_sumsq_.Dim() != 0) oderiv_sumsq_.AddVec(alpha, other->oderiv_sumsq_); count_ += alpha * other->count_; + oderiv_count_ += 
alpha * other->oderiv_count_; num_dims_self_repaired_ += alpha * other->num_dims_self_repaired_; num_dims_processed_ += alpha * other->num_dims_processed_; } @@ -453,18 +468,21 @@ void NonlinearComponent::Read(std::istream &is, bool binary) { value_sum_.Read(is, binary); ExpectToken(is, binary, ""); deriv_sum_.Read(is, binary); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &count_); if (PeekToken(is, binary) == 'O') { ExpectToken(is, binary, ""); oderiv_sumsq_.Read(is, binary); oderiv_sumsq_.ApplyPow(2.0); + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &oderiv_count_); } else { - oderiv_sumsq_.Resize(deriv_sum_.Dim()); + oderiv_count_ = 0.0; + oderiv_sumsq_.Resize(0); } - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &count_); value_sum_.Scale(count_); deriv_sum_.Scale(count_); - oderiv_sumsq_.Scale(count_); + oderiv_sumsq_.Scale(oderiv_count_); std::string token; ReadToken(is, binary, &token); @@ -518,15 +536,19 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const { if (count_ != 0.0) temp.Scale(1.0 / count_); temp.Write(os, binary); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); temp.Resize(oderiv_sumsq_.Dim()); temp.CopyFromVec(oderiv_sumsq_); - if (count_ != 0.0) temp.Scale(1.0 / count_); + if (oderiv_count_ != 0.0) temp.Scale(1.0 / oderiv_count_); temp.ApplyPow(0.5); temp.Write(os, binary); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, count_); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, oderiv_count_); + WriteToken(os, binary, ""); WriteBasicType(os, binary, num_dims_self_repaired_); WriteToken(os, binary, ""); @@ -547,7 +569,7 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const { } NonlinearComponent::NonlinearComponent(): - dim_(-1), block_dim_(-1), count_(0.0), + dim_(-1), block_dim_(-1), count_(0.0), oderiv_count_(0.0), num_dims_self_repaired_(0.0), num_dims_processed_(0.0), 
self_repair_lower_threshold_(kUnsetThreshold), self_repair_upper_threshold_(kUnsetThreshold), @@ -556,7 +578,8 @@ NonlinearComponent::NonlinearComponent(): NonlinearComponent::NonlinearComponent(const NonlinearComponent &other): dim_(other.dim_), block_dim_(other.block_dim_), value_sum_(other.value_sum_), deriv_sum_(other.deriv_sum_), - oderiv_sumsq_(other.oderiv_sumsq_), count_(other.count_), + count_(other.count_), oderiv_sumsq_(other.oderiv_sumsq_), + oderiv_count_(other.oderiv_count_), num_dims_self_repaired_(other.num_dims_self_repaired_), num_dims_processed_(other.num_dims_processed_), self_repair_lower_threshold_(other.self_repair_lower_threshold_), diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index c096a78325b..c34d550d681 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -590,7 +590,7 @@ class UpdatableComponent: public Component { block-dim Defaults to dim, but may be any nonzero divisor of dim. It affects the self-repair, which will be done while treating the input/output as - repeating blocks of size 'block-dim' (e.g. blocks of filtes). It allows + repeating blocks of size 'block-dim' (e.g. blocks of filters). It allows us to do self-repair on the filter level in CNNs. Currently this only makes a difference for RectifiedLinearComponent. */ @@ -643,6 +643,10 @@ class NonlinearComponent: public Component { void StoreStatsInternal(const CuMatrixBase &out_value, const CuMatrixBase *deriv = NULL); + // This function may be called from child class members during backprop. It + // stores the 'oderiv_sumsq_' stats. + void StoreBackpropStats(const CuMatrixBase &out_deriv); + const NonlinearComponent &operator = (const NonlinearComponent &other); // Disallow. @@ -658,12 +662,15 @@ class NonlinearComponent: public Component { CuVector deriv_sum_; // stats of the derivative of the nonlinearity // (only applicable to element-by-element // nonlinearities, not Softmax. 
+ // Count corresponding to the stats in 'value_sum_' and 'deriv_sum_' + double count_; + CuVector oderiv_sumsq_; // Sum-square of the derivative of the // objective function, that we're propagating // back. Accumulated during the backprop; // used for diagnostics. - - double count_; + // Count corresponding to the stats in 'oderiv_sumsq_'. + double oderiv_count_; // some stats for self-repairing nonlinearities. double num_dims_self_repaired_; @@ -673,9 +680,6 @@ class NonlinearComponent: public Component { BaseFloat self_repair_lower_threshold_; BaseFloat self_repair_upper_threshold_; BaseFloat self_repair_scale_; - - // The mutex is used in UpdateStats, only for resizing vectors. - std::mutex mutex_; }; } // namespace nnet3 diff --git a/src/nnet3/nnet-component-test.cc b/src/nnet3/nnet-component-test.cc index 882ef112919..d7595378c1f 100644 --- a/src/nnet3/nnet-component-test.cc +++ b/src/nnet3/nnet-component-test.cc @@ -274,7 +274,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, int32 input_dim = c.InputDim(), output_dim = c.OutputDim(), - num_rows = RandInt(1, 20), + num_rows = RandInt(1, 100), rand_seed = Rand(); int32 properties = c.Properties(); CuMatrix input_data(num_rows, input_dim, kSetZero, input_stride_type), @@ -317,7 +317,7 @@ bool TestSimpleComponentDataDerivative(const Component &c, } KALDI_LOG << "Predicted objf-change = " << predicted_objf_change; KALDI_LOG << "Measured objf-change = " << measured_objf_change; - BaseFloat threshold = 0.05; + BaseFloat threshold = 0.1; bool ans = ApproxEqual(predicted_objf_change, measured_objf_change, threshold); if (!ans) KALDI_WARN << "Data-derivative test failed, component-type=" @@ -442,7 +442,7 @@ bool TestSimpleComponentModelDerivative(const Component &c, void UnitTestNnetComponent() { - for (int32 n = 0; n < 2000; n++) { + for (int32 n = 0; n < 200; n++) { Component *c = GenerateRandomSimpleComponent(); KALDI_LOG << c->Info(); TestNnetComponentIo(c); From 
9f362a0455e08a78b971856c468ec1e9e2615fc9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 4 Jan 2018 19:09:22 -0500 Subject: [PATCH 044/184] [src] Finish code for oderiv-rms stats. --- src/nnet3/nnet-component-itf.cc | 10 ++++++++++ src/nnet3/nnet-simple-component.cc | 26 ++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index e2a316835fd..0a82a592102 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -359,6 +359,10 @@ void NonlinearComponent::StoreStatsInternal( void NonlinearComponent::StoreBackpropStats( const CuMatrixBase &out_deriv) { + // only store these stats about every 4 minibatches. + if (RandInt(0, 3) == 0) + return; + KALDI_ASSERT(out_deriv.NumCols() == dim_); // Check we have the correct dimensions. @@ -413,6 +417,9 @@ std::string NonlinearComponent::Info() const { if (oderiv_count_ > 0 && oderiv_sumsq_.Dim() == dim_) { Vector oderiv_rms(oderiv_sumsq_); oderiv_rms.Scale(1.0 / oderiv_count_); + // The ApplyMin() is so that the statement after it does not fail even if we + // had subtracted models (e.g. in full_progress.*.log). + oderiv_rms.ApplyFloor(0.0); oderiv_rms.ApplyPow(0.5); stream << ", oderiv-rms=" << SummarizeVector(oderiv_rms) << ", oderiv-count=" << oderiv_count_; @@ -543,6 +550,9 @@ void NonlinearComponent::Write(std::ostream &os, bool binary) const { temp.Resize(oderiv_sumsq_.Dim()); temp.CopyFromVec(oderiv_sumsq_); if (oderiv_count_ != 0.0) temp.Scale(1.0 / oderiv_count_); + // The ApplyMin() is so that the statement after it does not fail even if we + // had subtracted models (e.g. in full_progress.*.log). 
+ temp.ApplyFloor(0.0); temp.ApplyPow(0.5); temp.Write(os, binary); diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 34d24a39f24..5bd6ffeee32 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -331,8 +331,10 @@ void SigmoidComponent::Backprop(const std::string &debug_info, if (in_deriv != NULL) { in_deriv->DiffSigmoid(out_value, out_deriv); SigmoidComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -839,8 +841,10 @@ void TanhComponent::Backprop(const std::string &debug_info, if (in_deriv != NULL) { in_deriv->DiffTanh(out_value, out_deriv); TanhComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(out_value, in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -889,8 +893,10 @@ void RectifiedLinearComponent::Backprop( in_deriv->MulElements(out_deriv); RectifiedLinearComponent *to_update = dynamic_cast(to_update_in); - if (to_update != NULL) + if (to_update != NULL) { RepairGradients(in_deriv, to_update); + to_update->StoreBackpropStats(out_deriv); + } } } @@ -3404,6 +3410,13 @@ void SoftmaxComponent::Backprop(const std::string &debug_info, void *memo, Component *to_update_in, CuMatrixBase *in_deriv) const { + + if (to_update_in) { + SoftmaxComponent *to_update = + dynamic_cast(to_update_in); + to_update->StoreBackpropStats(out_deriv); + } + if (in_deriv == NULL) return; /* @@ -3443,8 +3456,13 @@ void LogSoftmaxComponent::Backprop(const std::string &debug_info, const CuMatrixBase &out_value, const CuMatrixBase &out_deriv, void *memo, - Component *, // to_update + Component *to_update_in, CuMatrixBase *in_deriv) const { + if (to_update_in) { + LogSoftmaxComponent *to_update = + dynamic_cast(to_update_in); + 
to_update->StoreBackpropStats(out_deriv); + } if (in_deriv == NULL) return; in_deriv->DiffLogSoftmaxPerRow(out_value, out_deriv); From 8e5f520a94920688e3cb98f34b809385225da899 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 4 Jan 2018 19:10:09 -0500 Subject: [PATCH 045/184] [src] Work around problem related to ungetc failures on ifstream --- src/base/io-funcs.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/base/io-funcs.cc b/src/base/io-funcs.cc index 8b8662b6401..90988faf3ea 100644 --- a/src/base/io-funcs.cc +++ b/src/base/io-funcs.cc @@ -178,8 +178,14 @@ int PeekToken(std::istream &is, bool binary) { } int ans = is.peek(); if (read_bracket) { - if (!is.unget()) + if (!is.unget()) { KALDI_WARN << "Error ungetting '<' in PeekToken"; + // Clear the bad bit. It seems to be possible for this code to be + // reached, and the C++ standard is very vague on whether even a single + // call to unget() should succeed; see + // http://www.cplusplus.com/reference/istream/istream/unget/ + is.clear(); + } } return ans; } @@ -197,7 +203,12 @@ void ExpectToken(std::istream &is, bool binary, const char *token) { KALDI_ERR << "Failed to read token [started at file position " << pos_at_start << "], expected " << token; } - if (strcmp(str.c_str(), token) != 0) { + // The second half of the '&&' expression below is so that if we're expecting + // "", we will accept "Foo>" instead. This is so that the model-reading + // code will tolerate errors in PeekToken where is.unget() failed; search for + // is.clear() in PeekToken() for an explanation. + if (strcmp(str.c_str(), token) != 0 && + !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str <<"\"."; } From b76f02abd051ec54427148fa118f6c57a394c4b4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 4 Jan 2018 19:11:00 -0500 Subject: [PATCH 046/184] [scripts] improve messages in chain training. 
--- egs/wsj/s5/steps/nnet3/chain/train.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index e52d2ecee20..011b6894938 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -216,15 +216,13 @@ def process_args(args): "--trainer.deriv-truncate-margin.".format( args.deriv_truncate_margin)) - if (not os.path.exists(args.dir) - or (not os.path.exists(args.dir+"/configs") and - not os.path.exists(args.input_model))): - raise Exception("This script expects {0} to exist. Also either " - "--trainer.input-model option as initial 'raw' model " - "(used as 0.raw in the script) should be supplied or " - "{0}/configs directory which is the output of " - "make_configs.py script should be provided." - "".format(args.dir)) + if (not os.path.exists(args.dir)): + raise Exception("This script expects --dir={0} to exist.") + if (not os.path.exists(args.dir+"/configs") and + (args.input_model is None or not os.path.exists(args.input_model))): + raise Exception("Either --trainer.input-model option should be supplied, " + "and exist; or the {0}/configs directory should exist.") + if args.transform_dir is None: args.transform_dir = args.lat_dir From 414d33c5c9198276ac6b301c3f547b08a1bc91fa Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Sun, 31 Dec 2017 12:03:33 -0800 Subject: [PATCH 047/184] [src] Make faster ApplyFloor and ApplyCeiling variants for Matrix and CuMatrix. (#2115) ( breaks backwards compatibility, since the signatures of these methods change). 
--- src/cudamatrix/cu-kernels.cu | 8 +- src/cudamatrix/cu-vector-speed-test.cc | 93 ++++++++++++++++++++++ src/cudamatrix/cu-vector-test.cc | 44 ++++++++-- src/cudamatrix/cu-vector.cc | 63 +++++++++------ src/cudamatrix/cu-vector.h | 5 +- src/feat/feature-functions.cc | 3 +- src/gmm/mle-diag-gmm.cc | 2 +- src/ivector/ivector-extractor.cc | 6 +- src/ivector/plda.cc | 3 +- src/matrix/kaldi-vector.cc | 42 ++++++---- src/matrix/kaldi-vector.h | 11 ++- src/matrix/matrix-lib-test.cc | 17 +++- src/nnet2/get-feature-transform.cc | 3 +- src/nnet2/nnet-precondition-online-test.cc | 5 +- src/nnet2/nnet-precondition-online.cc | 3 +- src/nnet3/natural-gradient-online-test.cc | 5 +- src/nnet3/natural-gradient-online.cc | 3 +- src/nnet3/nnet-general-component.cc | 3 +- src/nnet3/nnet-simple-component.cc | 3 +- src/sgmm2/am-sgmm2.cc | 3 +- 20 files changed, 254 insertions(+), 71 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 4ebdcf6c988..2f8f37224be 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -7,7 +7,7 @@ // 2013 Xiaohui Zhang // 2013-2015 Guoguo Chen // 2016-2017 Shiyin Kang -// 2017 Hossein Hadian +// 2017 Hossein Hadian, Daniel Galvez // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -1879,8 +1879,7 @@ static void _apply_floor(Real* mat, Real floor_val, MatrixDim d) { int index = i + j * d.stride; if (i < d.cols && j < d.rows) { - if (mat[index] < floor_val) - mat[index] = floor_val; + mat[index] = max(mat[index], floor_val); } } @@ -2036,8 +2035,7 @@ static void _apply_ceiling(Real* mat, Real ceiling_val, MatrixDim d) { int index = i + j * d.stride; if (i < d.cols && j < d.rows) { - if (mat[index] > ceiling_val) - mat[index] = ceiling_val; + mat[index] = min(mat[index], ceiling_val); } } diff --git a/src/cudamatrix/cu-vector-speed-test.cc b/src/cudamatrix/cu-vector-speed-test.cc index 7227c25c0b1..b5efda3d8de 100644 --- a/src/cudamatrix/cu-vector-speed-test.cc +++ b/src/cudamatrix/cu-vector-speed-test.cc @@ -1,6 +1,7 @@ // cudamatrix/cu-vector-speed-test.cc // Copyright 2013 Johns Hopkins University (author: Daniel Povey) +// Copyright 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -256,6 +257,90 @@ template void TestCuVectorAddColSumMat(int32 dim, MatrixTranspose } +template void TestCuVectorApplyFloor(int32 dim) { + BaseFloat time_in_secs = 0.02; + CuVector v(dim); + v.SetRandn(); + Real threshold = RandInt(-35000, 35000) / Real(100); + + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + MatrixIndexT dummy_count; + v.ApplyFloor(threshold, &dummy_count); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuVector::ApplyFloor" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; + +} + + +template void TestCuVectorApplyFloorNoCount(int32 dim) { + BaseFloat time_in_secs = 0.02; + CuVector v(dim); + v.SetRandn(); + Real threshold = RandInt(-35000, 35000) / Real(100); + + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + v.ApplyFloor(threshold, nullptr); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + 
KALDI_LOG << "For CuVector::ApplyFloor (no count variety)" << NameOf() + << ", for dim = " << dim << ", speed was " << gflops + << " gigaflops."; + +} + + +template void TestCuVectorApplyCeiling(int32 dim) { + BaseFloat time_in_secs = 0.02; + CuVector v(dim); + v.SetRandn(); + Real threshold = RandInt(-35000, 35000) / Real(100); + + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + MatrixIndexT dummy_count; + v.ApplyCeiling(threshold, &dummy_count); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuVector::ApplyCeiling" << NameOf() << ", for dim = " + << dim << ", speed was " << gflops << " gigaflops."; + +} + + +template void TestCuVectorApplyCeilingNoCount(int32 dim) { + BaseFloat time_in_secs = 0.02; + CuVector v(dim); + v.SetRandn(); + Real threshold = RandInt(-35000, 35000) / Real(100); + + Timer tim; + int32 iter = 0; + for (;tim.Elapsed() < time_in_secs; iter++) { + v.ApplyCeiling(threshold, nullptr); + } + + BaseFloat fdim = dim; + BaseFloat gflops = (fdim * iter) / (tim.Elapsed() * 1.0e+09); + KALDI_LOG << "For CuVector::ApplyCeiling (no count variety)" << NameOf() + << ", for dim = " << dim << ", speed was " << gflops + << " gigaflops."; + +} + + template void CudaVectorSpeedTest() { std::vector sizes; sizes.push_back(16); @@ -296,6 +381,14 @@ template void CudaVectorSpeedTest() { TestCuVectorAddColSumMat(sizes[s], kNoTrans); TestCuVectorAddColSumMat(sizes[s], kTrans); } + for (int32 s = 0; s < ns; s++) { + TestCuVectorApplyFloor(sizes[s]); + TestCuVectorApplyFloorNoCount(sizes[s]); + } + for (int32 s = 0; s < ns; s++) { + TestCuVectorApplyCeiling(sizes[s]); + TestCuVectorApplyCeilingNoCount(sizes[s]); + } } diff --git a/src/cudamatrix/cu-vector-test.cc b/src/cudamatrix/cu-vector-test.cc index 174a2dca6bf..0aa8ae931a4 100644 --- a/src/cudamatrix/cu-vector-test.cc +++ b/src/cudamatrix/cu-vector-test.cc @@ -2,7 +2,7 @@ // Copyright 2013 Lucas Ondel // 2013 Johns 
Hopkins University (author: Daniel Povey) -// 2017 Hossein Hadian +// 2017 Hossein Hadian, Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -550,8 +550,9 @@ template void CuVectorUnitTestApplyFloor() { Vector vector(cu_vector); BaseFloat floor = 0.33 * (-5 + Rand() % 10); - int32 i = cu_vector.ApplyFloor(floor); - int32 j = vector.ApplyFloor(floor); + MatrixIndexT i, j; + cu_vector.ApplyFloor(floor, &i); + vector.ApplyFloor(floor, &j); CuVector cu2(vector); @@ -563,6 +564,21 @@ template void CuVectorUnitTestApplyFloor() { } } +template void CuVectorUnitTestApplyFloorNoCount() { + for (int32 l = 0; l < 10; l++) { + int32 dim = 100 + Rand() % 700; + CuVector cu_vector1(dim); + cu_vector1.SetRandn(); + CuVector cu_vector2(cu_vector1); + + BaseFloat floor = 0.33 * (-5 + Rand() % 10); + MatrixIndexT dummy_count; + cu_vector1.ApplyFloor(floor, &dummy_count); + cu_vector2.ApplyFloor(floor, nullptr); + AssertEqual(cu_vector1, cu_vector2); + } +} + template void CuVectorUnitTestApplyCeiling() { for (int32 l = 0; l < 10; l++) { int32 dim = 100 + Rand() % 700; @@ -571,8 +587,9 @@ template void CuVectorUnitTestApplyCeiling() { Vector vector(cu_vector); BaseFloat floor = 0.33 * (-5 + Rand() % 10); - int32 i = cu_vector.ApplyCeiling(floor); - int32 j = vector.ApplyCeiling(floor); + MatrixIndexT i, j; + cu_vector.ApplyCeiling(floor, &i); + vector.ApplyCeiling(floor, &j); CuVector cu2(vector); @@ -584,6 +601,21 @@ template void CuVectorUnitTestApplyCeiling() { } } +template void CuVectorUnitTestApplyCeilingNoCount() { + for (int32 l = 0; l < 10; l++) { + int32 dim = 100 + Rand() % 700; + CuVector cu_vector1(dim); + cu_vector1.SetRandn(); + CuVector cu_vector2(cu_vector1); + + BaseFloat floor = 0.33 * (-5 + Rand() % 10); + MatrixIndexT dummy_count; + cu_vector1.ApplyCeiling(floor, &dummy_count); + cu_vector2.ApplyCeiling(floor, nullptr); + AssertEqual(cu_vector1, cu_vector2); + } +} + template void CuVectorUnitTestApplyPow() { for (int32 l = 0; 
l < 10; l++) { int32 dim = 100 + Rand() % 700; @@ -770,6 +802,8 @@ template void CuVectorUnitTest() { CuVectorUnitTestApplyExp(); CuVectorUnitTestApplyLog(); CuVectorUnitTestApplyFloor(); + CuVectorUnitTestApplyFloorNoCount(); + CuVectorUnitTestApplyCeilingNoCount(); CuVectorUnitTestApplyCeiling(); CuVectorUnitTestApplyPow(); CuVectorUnitTestAddMatVec(); diff --git a/src/cudamatrix/cu-vector.cc b/src/cudamatrix/cu-vector.cc index aa708142696..f61fd4408db 100644 --- a/src/cudamatrix/cu-vector.cc +++ b/src/cudamatrix/cu-vector.cc @@ -2,6 +2,7 @@ // Copyright 2012-2013 Karel Vesely // 2012-2014 Johns Hopkins University (author: Daniel Povey) +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -342,52 +343,68 @@ void CuVectorBase::ApplySoftMax() { } template -MatrixIndexT CuVectorBase::ApplyFloor(Real floor_val) { - MatrixIndexT num_floored = 0; +void CuVectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return 0; - CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); + if (floored_count == nullptr) { + if (dim_ == 0) return; + CuTimer tim; + // We are calling a function meant for matrices, by viewing the + // vector as a matrix with a single row. 
+ ::MatrixDim dim = {1, Dim(), 1}; + cuda_apply_floor(dimGrid, dimBlock, data_, floor_val, dim); + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloorNoCount", tim); + } else { + if (dim_ == 0) { *floored_count = 0; return; } + CuTimer tim; - CuVector count_vec(dim_, kUndefined); + CuVector count_vec(dim_, kUndefined); - cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); - num_floored = count_vec.Sum(); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim); + cuda_vec_apply_floor(dimGrid, dimBlock, data_, floor_val, count_vec.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + *floored_count = count_vec.Sum(); + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyFloor", tim); + } } else #endif { - num_floored = Vec().ApplyFloor(floor_val); + Vec().ApplyFloor(floor_val, floored_count); } - return num_floored; - } template -MatrixIndexT CuVectorBase::ApplyCeiling(Real ceiling_val) { - MatrixIndexT num_ceiled = 0; +void CuVectorBase::ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_count) { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { - if (dim_ == 0) return 0; - CuTimer tim; int dimBlock(CU1DBLOCK); int dimGrid(n_blocks(dim_,CU1DBLOCK)); + if (ceiled_count == nullptr) { + if (dim_ == 0) return; + CuTimer tim; + // We are calling a function meant for matrices, by viewing the + // vector as a matrix with a single row. 
+ ::MatrixDim dim = {1, Dim(), 1}; + cuda_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, dim); + + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeilingNoCount", tim); + } else { + if (dim_ == 0) { *ceiled_count = 0; return; } + CuTimer tim; - CuVector count_vec(dim_, kUndefined); + CuVector count_vec(dim_, kUndefined); - cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_); - CU_SAFE_CALL(cudaGetLastError()); - num_ceiled = count_vec.Sum(); - CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeiling", tim); + cuda_vec_apply_ceiling(dimGrid, dimBlock, data_, ceiling_val, count_vec.Data(), dim_); + CU_SAFE_CALL(cudaGetLastError()); + *ceiled_count = count_vec.Sum(); + CuDevice::Instantiate().AccuProfile("CuVectorBase::ApplyCeiling", tim); + } } else #endif { - num_ceiled = Vec().ApplyCeiling(ceiling_val); + Vec().ApplyCeiling(ceiling_val, ceiled_count); } - return num_ceiled; } template diff --git a/src/cudamatrix/cu-vector.h b/src/cudamatrix/cu-vector.h index 2c9768b2998..69ca2ae3125 100644 --- a/src/cudamatrix/cu-vector.h +++ b/src/cudamatrix/cu-vector.h @@ -5,6 +5,7 @@ // Lucas Ondel // 2013 Xiaohui Zhang // 2015 Guoguo Chen +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -133,8 +134,8 @@ class CuVectorBase { void ApplySoftMax(); void ApplyExp(); void ApplyLog(); - MatrixIndexT ApplyFloor(Real floor_val); - MatrixIndexT ApplyCeiling(Real ceiling_val); + void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = NULL); + void ApplyCeiling(Real ceiling_val, MatrixIndexT *ceiled_count = NULL); void ApplyPow(Real power); Real Sum() const; diff --git a/src/feat/feature-functions.cc b/src/feat/feature-functions.cc index efb83baf52e..4ae2550c364 100644 --- a/src/feat/feature-functions.cc +++ b/src/feat/feature-functions.cc @@ -321,7 +321,8 @@ void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts, variance.AddVec2(-1.0 / (window_frames * window_frames), 
cur_sum); // now "variance" is the variance of the features in the window, // around their own mean. - int32 num_floored = variance.ApplyFloor(1.0e-10); + int32 num_floored; + variance.ApplyFloor(1.0e-10, &num_floored); if (num_floored > 0 && num_frames > 1) { if (opts.max_warnings == warning_count) { KALDI_WARN << "Suppressing the remaining variance flooring " diff --git a/src/gmm/mle-diag-gmm.cc b/src/gmm/mle-diag-gmm.cc index bf2fcd5a0bd..48fc0d4d740 100644 --- a/src/gmm/mle-diag-gmm.cc +++ b/src/gmm/mle-diag-gmm.cc @@ -343,7 +343,7 @@ void MleDiagGmmUpdate(const MleDiagGmmOptions &config, if (config.variance_floor_vector.Dim() != 0) { floored = var.ApplyFloor(config.variance_floor_vector); } else { - floored = var.ApplyFloor(config.min_variance); + var.ApplyFloor(config.min_variance, &floored); } if (floored != 0) { elements_floored += floored; diff --git a/src/ivector/ivector-extractor.cc b/src/ivector/ivector-extractor.cc index 44393e79879..aaba3837698 100644 --- a/src/ivector/ivector-extractor.cc +++ b/src/ivector/ivector-extractor.cc @@ -348,7 +348,8 @@ static double GetLogDetNoFailure(const SpMatrix &var) { } catch (...) 
{ Vector eigs(var.NumRows()); var.Eig(&eigs); - int32 floored = eigs.ApplyFloor(1.0e-20); + int32 floored; + eigs.ApplyFloor(1.0e-20, &floored); if (floored > 0) KALDI_WARN << "Floored " << floored << " eigenvalues of variance."; eigs.ApplyLog(); @@ -1579,7 +1580,8 @@ double IvectorExtractorStats::UpdatePrior( covar.Eig(&s, &P); KALDI_LOG << "Eigenvalues of iVector covariance range from " << s.Min() << " to " << s.Max(); - int32 num_floored = s.ApplyFloor(1.0e-07); + int32 num_floored; + s.ApplyFloor(1.0e-07, &num_floored); if (num_floored > 0) KALDI_WARN << "Floored " << num_floored << " eigenvalues of covar " << "of iVectors."; diff --git a/src/ivector/plda.cc b/src/ivector/plda.cc index 748d6e8d502..d14d392e2f5 100644 --- a/src/ivector/plda.cc +++ b/src/ivector/plda.cc @@ -488,7 +488,8 @@ void PldaEstimator::GetOutput(Plda *plda) { between_var_proj.Eig(&s, &U); KALDI_ASSERT(s.Min() >= 0.0); - int32 n = s.ApplyFloor(0.0); + int32 n; + s.ApplyFloor(0.0, &n); if (n > 0) { KALDI_WARN << "Floored " << n << " eigenvalues of between-class " << "variance to zero."; diff --git a/src/matrix/kaldi-vector.cc b/src/matrix/kaldi-vector.cc index d3d3de47013..8cedc9c0487 100644 --- a/src/matrix/kaldi-vector.cc +++ b/src/matrix/kaldi-vector.cc @@ -5,6 +5,7 @@ // Petr Schwarz; Yanmin Qian; Jan Silovsky; // Haihua Xu; Wei Shi // 2015 Guoguo Chen +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors @@ -811,30 +812,41 @@ void VectorBase::ApplyAbs() { } template -MatrixIndexT VectorBase::ApplyFloor(Real floor_val) { - MatrixIndexT num_floored = 0; - for (MatrixIndexT i = 0; i < dim_; i++) { - if (data_[i] < floor_val) { - data_[i] = floor_val; - num_floored++; +void VectorBase::ApplyFloor(Real floor_val, MatrixIndexT *floored_count) { + if (floored_count == nullptr) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = std::max(data_[i], floor_val); + } + } else { + MatrixIndexT num_floored = 0; + for (MatrixIndexT i = 0; i < dim_; i++) { + if 
(data_[i] < floor_val) { + data_[i] = floor_val; + num_floored++; + } } + *floored_count = num_floored; } - return num_floored; } template -MatrixIndexT VectorBase::ApplyCeiling(Real ceil_val) { - MatrixIndexT num_changed = 0; - for (MatrixIndexT i = 0; i < dim_; i++) { - if (data_[i] > ceil_val) { - data_[i] = ceil_val; - num_changed++; +void VectorBase::ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count) { + if (ceiled_count == nullptr) { + for (MatrixIndexT i = 0; i < dim_; i++) { + data_[i] = std::min(data_[i], ceil_val); + } + } else { + MatrixIndexT num_changed = 0; + for (MatrixIndexT i = 0; i < dim_; i++) { + if (data_[i] > ceil_val) { + data_[i] = ceil_val; + num_changed++; + } } + *ceiled_count = num_changed; } - return num_changed; } - template MatrixIndexT VectorBase::ApplyFloor(const VectorBase &floor_vec) { KALDI_ASSERT(floor_vec.Dim() == dim_); diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h index dcfdd47b09c..3eb4a932095 100644 --- a/src/matrix/kaldi-vector.h +++ b/src/matrix/kaldi-vector.h @@ -6,6 +6,7 @@ // Karel Vesely; Go Vivace Inc.; Arnab Ghoshal // Wei Shi; // 2015 Guoguo Chen +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -133,11 +134,13 @@ class VectorBase { /// Take absolute value of each of the elements void ApplyAbs(); - /// Applies floor to all elements. Returns number of elements floored. - MatrixIndexT ApplyFloor(Real floor_val); + /// Applies floor to all elements. Returns number of elements + /// floored in floored_count if it is non-null. + void ApplyFloor(Real floor_val, MatrixIndexT *floored_count = nullptr); - /// Applies ceiling to all elements. Returns number of elements changed. - MatrixIndexT ApplyCeiling(Real ceil_val); + /// Applies ceiling to all elements. Returns number of elements + /// changed in ceiled_count if it is non-null. + void ApplyCeiling(Real ceil_val, MatrixIndexT *ceiled_count = nullptr); /// Applies floor to all elements. 
Returns number of elements floored. MatrixIndexT ApplyFloor(const VectorBase &floor_vec); diff --git a/src/matrix/matrix-lib-test.cc b/src/matrix/matrix-lib-test.cc index b97c70dbbdf..ef82ee7ed02 100644 --- a/src/matrix/matrix-lib-test.cc +++ b/src/matrix/matrix-lib-test.cc @@ -6,6 +6,7 @@ // Johns Hopkins University (Author: Daniel Povey); // Haihua Xu; Wei Shi // 2015 Guoguo Chen +// 2017 Daniel Galvez // See ../../COPYING for clarification regarding multiple authors // @@ -2286,8 +2287,10 @@ template static void UnitTestFloorCeiling() { v.SetRandn(); Real pivot = v(5); Vector f(v), f2(v), c(v), c2(v); - MatrixIndexT floored2 = f.ApplyFloor(pivot), - ceiled2 = c.ApplyCeiling(pivot); + MatrixIndexT floored2; + f.ApplyFloor(pivot, &floored2); + MatrixIndexT ceiled2; + c.ApplyCeiling(pivot, &ceiled2); MatrixIndexT floored = 0, ceiled = 0; for (MatrixIndexT d = 0; d < dimM; d++) { if (f2(d) < pivot) { f2(d) = pivot; floored++; } @@ -2297,6 +2300,16 @@ template static void UnitTestFloorCeiling() { AssertEqual(c, c2); KALDI_ASSERT(floored == floored2); KALDI_ASSERT(ceiled == ceiled2); + + // Check that the non-counting variants are equivalent to the counting + // variants. + Vector f3(v); + f3.ApplyFloor(pivot); + AssertEqual(f2, f3); + + Vector c3(v); + c3.ApplyCeiling(pivot); + AssertEqual(c2, c3); } } diff --git a/src/nnet2/get-feature-transform.cc b/src/nnet2/get-feature-transform.cc index 3f348d2de76..38ec9bc3da9 100644 --- a/src/nnet2/get-feature-transform.cc +++ b/src/nnet2/get-feature-transform.cc @@ -111,7 +111,8 @@ void FeatureTransformEstimate::EstimateInternal( Vector s(min_dim); M->Svd(&s, &U, &Vt); // decompose m = U diag(s) Vt. 
BaseFloat max_s = s.Max(); - int32 n = s.ApplyCeiling(opts.max_singular_value); + int32 n; + s.ApplyCeiling(opts.max_singular_value, &n); if (n > 0) { KALDI_LOG << "Applied ceiling to " << n << " out of " << s.Dim() << " singular values of transform using ceiling " diff --git a/src/nnet2/nnet-precondition-online-test.cc b/src/nnet2/nnet-precondition-online-test.cc index 30f9a33ef5e..b0306db72a2 100644 --- a/src/nnet2/nnet-precondition-online-test.cc +++ b/src/nnet2/nnet-precondition-online-test.cc @@ -170,7 +170,8 @@ void OnlinePreconditionerSimple::PreconditionDirectionsCpu( Z_t.Eig(&c_t, &U_t); SortSvd(&c_t, &U_t); double c_t_floor = pow(rho_t_ * (1.0 - eta), 2); - int32 nf = c_t.ApplyFloor(c_t_floor); + int32 nf; + c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) { KALDI_WARN << "Floored " << nf << " elements of c_t."; } @@ -198,7 +199,7 @@ void OnlinePreconditionerSimple::PreconditionDirectionsCpu( KALDI_WARN << "flooring rho_{t+1} to " << floor_val << ", was " << rho_t1; rho_t1 = floor_val; } - nf = d_t1.ApplyFloor(floor_val); + d_t1.ApplyFloor(floor_val, &nf); if (nf > 0) { KALDI_VLOG(3) << "d_t1 was " << d_t1; KALDI_WARN << "Floored " << nf << " elements of d_{t+1}."; diff --git a/src/nnet2/nnet-precondition-online.cc b/src/nnet2/nnet-precondition-online.cc index 7154548f175..51e7c5b13c6 100644 --- a/src/nnet2/nnet-precondition-online.cc +++ b/src/nnet2/nnet-precondition-online.cc @@ -416,7 +416,8 @@ void OnlinePreconditioner::PreconditionDirectionsInternal( bool must_reorthogonalize = (c_t(0) > condition_threshold * c_t(R - 1)); BaseFloat c_t_floor = pow(rho_t * (1 - eta), 2); - int32 nf = c_t.ApplyFloor(c_t_floor); + int32 nf; + c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) must_reorthogonalize = true; if (nf > 0 && self_debug_) { diff --git a/src/nnet3/natural-gradient-online-test.cc b/src/nnet3/natural-gradient-online-test.cc index 2829d4ebde7..445cc43f868 100644 --- a/src/nnet3/natural-gradient-online-test.cc +++ b/src/nnet3/natural-gradient-online-test.cc @@ 
-170,7 +170,8 @@ void OnlineNaturalGradientSimple::PreconditionDirectionsCpu( Z_t.Eig(&c_t, &U_t); SortSvd(&c_t, &U_t); double c_t_floor = pow(rho_t_ * (1.0 - eta), 2); - int32 nf = c_t.ApplyFloor(c_t_floor); + int32 nf; + c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) { KALDI_WARN << "Floored " << nf << " elements of c_t."; } @@ -198,7 +199,7 @@ void OnlineNaturalGradientSimple::PreconditionDirectionsCpu( KALDI_WARN << "flooring rho_{t+1} to " << floor_val << ", was " << rho_t1; rho_t1 = floor_val; } - nf = d_t1.ApplyFloor(floor_val); + d_t1.ApplyFloor(floor_val, &nf); if (nf > 0) { KALDI_VLOG(3) << "d_t1 was " << d_t1; KALDI_WARN << "Floored " << nf << " elements of d_{t+1}."; diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index 19a7d5fafdc..b5740053f46 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -406,7 +406,8 @@ void OnlineNaturalGradient::PreconditionDirectionsInternal( bool must_reorthogonalize = (c_t(0) > condition_threshold * c_t(R - 1)); BaseFloat c_t_floor = pow(rho_t * (1 - eta), 2); - int32 nf = c_t.ApplyFloor(c_t_floor); + int32 nf; + c_t.ApplyFloor(c_t_floor, &nf); if (nf > 0) must_reorthogonalize = true; if (nf > 0 && self_debug_) { diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index bc7405f2836..dd6e950a7d1 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1127,7 +1127,8 @@ void BackpropTruncationComponent::Backprop(const std::string &debug_info, kNoTrans, 0.0); // now clipping_scales contains the squared (norm of each row divided by // clipping_threshold) - int32 num_not_scaled = clipping_scales.ApplyFloor(1.0); + int32 num_not_scaled; + clipping_scales.ApplyFloor(1.0, &num_not_scaled); // now clipping_scales contains min(1, squared-(norm/clipping_threshold)) clipping_scales.ApplyPow(-0.5); // now clipping_scales contains max(1, clipping_threshold/vector_norm) diff --git 
a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 5bd6ffeee32..ea5a2489bc4 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -617,7 +617,8 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, kNoTrans, 0.0); // now clipping_scales contains the squared (norm of each row divided by // clipping_threshold) - int32 num_not_scaled = clipping_scales.ApplyFloor(1.0); + int32 num_not_scaled; + clipping_scales.ApplyFloor(1.0, &num_not_scaled); // now clipping_scales contains min(1, // squared-(norm/clipping_threshold)) if (num_not_scaled != clipping_scales.Dim()) { diff --git a/src/sgmm2/am-sgmm2.cc b/src/sgmm2/am-sgmm2.cc index 86623a12ca2..d249a5ab8b2 100644 --- a/src/sgmm2/am-sgmm2.cc +++ b/src/sgmm2/am-sgmm2.cc @@ -1045,7 +1045,8 @@ void AmSgmm2::ComputeFmllrPreXform(const Vector &state_occs, tmpB.Eig(diag_mean_scatter, &U); // Eq. (B.5): B = U D V^T int32 n; - if ((n = diag_mean_scatter->ApplyFloor(1.0e-04)) != 0) + diag_mean_scatter->ApplyFloor(1.0e-04, &n); + if (n != 0) KALDI_WARN << "Floored " << n << " elements of the mean-scatter matrix."; // Eq. (B.6): A_{pre} = U^T * L^{-1} From 47f9cc1c61da7b7e1f969006dda1b3c84c31f137 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Sun, 31 Dec 2017 12:59:23 -0800 Subject: [PATCH 048/184] [scripts] Fig bug in validate_data_dir.sh introduced in df7a41978f2. Thx:@jcsilva --- egs/wsj/s5/utils/validate_data_dir.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 11f1db806b3..dbbaeb10d5d 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -164,8 +164,8 @@ if [ -f $data/wav.scp ]; then if [ -f $data/text ]; then ! 
cmp -s $tmpdir/utts <(awk '{print $1}' <$data/segments) && \ echo "$0: Utterance list differs between $data/utt2spk and $data/segments " && \ - echo "$0: Lengths are $segments_len vs $num_utts"; - exit 1 + echo "$0: Lengths are $segments_len vs $num_utts" && \ + exit 1 fi cat $data/segments | awk '{print $2}' | sort | uniq > $tmpdir/recordings From d884c01da8f2e51cfe383f85432f40b645618163 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Sun, 31 Dec 2017 13:05:56 -0800 Subject: [PATCH 049/184] [scripts] Fix lattice_oracle_align.sh bug (Thx: @roman-verbit-ai) --- egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh index 8cac7263d78..29d52588807 100755 --- a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh +++ b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh @@ -74,7 +74,7 @@ oov=$(cat $lang/oov.int) utils/split_data.sh --per-utt $data $nj -sdata=$data/split$nj +sdata=$data/split${nj}utt if [ $stage -le 1 ]; then $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \ From 3e57faafb20344e24399e8bff0a58af39e69109c Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Mon, 1 Jan 2018 12:27:56 -0800 Subject: [PATCH 050/184] [build] Add new search dir for ATLAS (Thx: Sari Sultan) --- src/configure | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/configure b/src/configure index bec077b4a92..fc07cb2fc70 100755 --- a/src/configure +++ b/src/configure @@ -748,7 +748,7 @@ function linux_check_dynamic { function linux_configure_dynamic { if $threaded_atlas; then pt=t; else pt=s; fi # relevant to "fat" libraries, will change later for separate ones if [ -z $ATLASLIBDIR ]; then # Note: it'll pick up the last one below. 
- for dir in /usr{,/local}/lib{,64}{,/atlas,/atlas-sse2,/atlas-sse3} \ + for dir in /usr{,/local}/lib{,64}{,/atlas,/atlas-sse2,/atlas-sse3,/x86_64-linux-gnu} \ `pwd`/../tools/ATLAS/build/install/lib/ $ATLASROOT/lib; do linux_check_dynamic && ATLASLIBDIR=$dir && ATLASLIBNAME=$atlas_libname done From 30b623f84def68cdaecd4cc4367e54eba1e38301 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Tue, 2 Jan 2018 12:25:33 -0800 Subject: [PATCH 051/184] [scripts] Fix script issue affecting some xvector training (thanks: daniel garcia-romero) --- egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 9f9b5752ce6..72b776351f6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -531,7 +531,7 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, --verbose=3 {raw_models} \ "ark,bg:nnet3-copy-egs {multitask_egs_opts} \ {egs_rspecifier} ark:- | \ - nnet3-merge-egs --minibatch-size={mbsize} ark:- ark:- |" \ + nnet3-merge-egs --minibatch-size=1:{mbsize} ark:- ark:- |" \ "{out_model}" """.format(command=run_opts.command, combine_queue_opt=run_opts.combine_queue_opt, From 51abf1c617254b013bdf4fe200ec86e616095d20 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Thu, 4 Jan 2018 04:58:20 +0800 Subject: [PATCH 052/184] [egs] Improve comments for fisher_swbd tdnn_lstm_1a (#2122) --- egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh | 2 ++ egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh | 11 ++++++++--- egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index e1df22ede91..9810a03ee58 100644 --- 
a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 # The model training procedure is similar to run_blstm_6j.sh under egs/swbd/s5c diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index 81af4a128e8..d057470552f 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -1,5 +1,10 @@ #!/bin/bash -# same as run_tdnn_opgru_1a.sh, but replacing Norm-OPGRU with LSTMP. +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 + +# Same as run_tdnn_opgru_1a.sh, but replacing Norm-OPGRU with LSTMP. +# Also Batchnorm in TDNN layers does not reduce the WER in Fisher+SWBD, so in run_tdnn_lstm_1a.sh, +# I just apply renorm component in TDNN layers. # ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp # System tdnn_lstm_1a_sp # WER on eval2000(tg) 12.3 @@ -15,7 +20,7 @@ # Final train prob (xent) -0.882 # Final valid prob (xent) -0.9393 -# ./show_chain_wer.sh tdnn_lstm_1b_sp +# ./show_chain_wer.sh tdnn_lstm_1a_sp # %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys # %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys # %WER 8.4 | 1831 21395 | 92.7 5.1 2.2 1.1 8.4 42.3 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys @@ -23,7 +28,7 @@ # %WER 12.1 | 4459 42989 | 89.6 6.9 3.5 1.7 12.1 49.2 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys # %WER 8.2 | 1831 21395 | 93.1 5.1 1.8 1.3 8.2 41.7 | 
exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.swbd.filt.sys -# ./show_chain_wer_rt03.sh tdnn_lstm_1b_sp +# ./show_chain_wer_rt03.sh tdnn_lstm_1a_sp # %WER 9.6 | 3970 36721 | 91.5 5.5 3.0 1.1 9.6 41.2 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys # %WER 11.6 | 8420 76157 | 89.7 6.8 3.4 1.4 11.6 43.0 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys # %WER 13.3 | 4450 39436 | 88.0 7.4 4.6 1.3 13.3 44.5 | exp/chain/tdnn_lstm_1a_sp/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index e819a987a48..2de8d774451 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -1,4 +1,6 @@ #!/bin/bash +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 # This is based on TDNN_LSTM_1b (from egs/swbd/s5c), but using the NormOPGRU to replace the LSTMP, # and adding chunk-{left,right}-context-initial=0 From 1a383decc5a6c99dd94a13579e95753aff5a499b Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Fri, 5 Jan 2018 00:59:47 +0330 Subject: [PATCH 053/184] [egs] Add OCR/Handwriting Recognition examples (#1984) * OCR: Add IAM corpus with unk decoding support (#3) * Add a new English OCR database 'UW3' * Some minor fixes re IAM corpus * Fix an issue in IAM chain recipes + add a new recipe (#6) * Some fixes based on the pull request review * Various fixes + cleaning on IAM * Fix LM estimation and add extended dictionary + other minor fixes * Add README for IAM * Add output filter for scoring * Fix a bug RE switch to pyhton3 * Add updated results + minor fixes * Remove unk decoding -- gives almost no gain * Add UW3 OCR database * Fix cmd.sh in IAM + fix usages of train/decode_cmd in chain recipes * Various minor fixes on UW3 * 
Rename iam/s5 to iam/v1 * Add README file for UW3 * Various cosmetic fixes on UW3 scripts * Minor fixes in IAM --- egs/iam/README.txt | 4 + egs/iam/v1/cmd.sh | 13 + egs/iam/v1/image | 1 + egs/iam/v1/local/chain/compare_wer.sh | 59 +++++ egs/iam/v1/local/chain/run_cnn_1a.sh | 235 +++++++++++++++++ egs/iam/v1/local/chain/run_cnn_chainali_1a.sh | 244 +++++++++++++++++ egs/iam/v1/local/chain/run_cnn_chainali_1b.sh | 245 ++++++++++++++++++ egs/iam/v1/local/make_features.py | 87 +++++++ egs/iam/v1/local/prepare_data.sh | 149 +++++++++++ egs/iam/v1/local/prepare_dict.sh | 49 ++++ egs/iam/v1/local/process_data.py | 82 ++++++ egs/iam/v1/local/score.sh | 5 + egs/iam/v1/local/train_lm.sh | 139 ++++++++++ egs/iam/v1/local/wer_output_filter | 27 ++ egs/iam/v1/path.sh | 6 + egs/iam/v1/run.sh | 122 +++++++++ egs/iam/v1/steps | 1 + egs/iam/v1/utils | 1 + egs/uw3/README.txt | 4 + egs/uw3/v1/cmd.sh | 13 + egs/uw3/v1/image | 1 + egs/uw3/v1/local/chain/compare_wer.sh | 72 +++++ egs/uw3/v1/local/chain/run_cnn_1a.sh | 234 +++++++++++++++++ egs/uw3/v1/local/make_features.py | 97 +++++++ egs/uw3/v1/local/prepare_data.sh | 40 +++ egs/uw3/v1/local/prepare_dict.sh | 29 +++ egs/uw3/v1/local/process_data.py | 61 +++++ egs/uw3/v1/local/score.sh | 156 +++++++++++ egs/uw3/v1/local/train_lm.sh | 102 ++++++++ .../v1/local/unk_arc_post_to_transcription.py | 86 ++++++ egs/uw3/v1/path.sh | 6 + egs/uw3/v1/run.sh | 106 ++++++++ egs/uw3/v1/steps | 1 + egs/uw3/v1/utils | 1 + 34 files changed, 2478 insertions(+) create mode 100644 egs/iam/README.txt create mode 100644 egs/iam/v1/cmd.sh create mode 120000 egs/iam/v1/image create mode 100755 egs/iam/v1/local/chain/compare_wer.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_1a.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1a.sh create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1b.sh create mode 100755 egs/iam/v1/local/make_features.py create mode 100755 egs/iam/v1/local/prepare_data.sh create mode 100755 
egs/iam/v1/local/prepare_dict.sh create mode 100755 egs/iam/v1/local/process_data.py create mode 100755 egs/iam/v1/local/score.sh create mode 100755 egs/iam/v1/local/train_lm.sh create mode 100755 egs/iam/v1/local/wer_output_filter create mode 100755 egs/iam/v1/path.sh create mode 100755 egs/iam/v1/run.sh create mode 120000 egs/iam/v1/steps create mode 120000 egs/iam/v1/utils create mode 100644 egs/uw3/README.txt create mode 100644 egs/uw3/v1/cmd.sh create mode 120000 egs/uw3/v1/image create mode 100755 egs/uw3/v1/local/chain/compare_wer.sh create mode 100755 egs/uw3/v1/local/chain/run_cnn_1a.sh create mode 100755 egs/uw3/v1/local/make_features.py create mode 100755 egs/uw3/v1/local/prepare_data.sh create mode 100755 egs/uw3/v1/local/prepare_dict.sh create mode 100755 egs/uw3/v1/local/process_data.py create mode 100755 egs/uw3/v1/local/score.sh create mode 100755 egs/uw3/v1/local/train_lm.sh create mode 100755 egs/uw3/v1/local/unk_arc_post_to_transcription.py create mode 100755 egs/uw3/v1/path.sh create mode 100755 egs/uw3/v1/run.sh create mode 120000 egs/uw3/v1/steps create mode 120000 egs/uw3/v1/utils diff --git a/egs/iam/README.txt b/egs/iam/README.txt new file mode 100644 index 00000000000..daeb67af541 --- /dev/null +++ b/egs/iam/README.txt @@ -0,0 +1,4 @@ + +This directory contains example scripts for handwriting recognition on +the IAM dataset: +http://www.fki.inf.unibe.ch/databases/iam-handwriting-database diff --git a/egs/iam/v1/cmd.sh b/egs/iam/v1/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/iam/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/iam/v1/image b/egs/iam/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/iam/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..4eb665fc702 --- /dev/null +++ b/egs/iam/v1/local/chain/compare_wer.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. 
+fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..3b1571091c1 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_1a.sh @@ -0,0 +1,235 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098) + +# cat exp/chain/cnn_1a/decode_test/scoring_kaldi/best_* +# %WER 5.94 [ 3913 / 65921, 645 ins, 1466 del, 1802 sub ] exp/chain/cnn_1a/decode_test//cer_11_0.0 +# %WER 9.13 [ 1692 / 18542, 162 ins, 487 del, 1043 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
+ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${train_data_dir} \ + data/$lang_test $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + 
--egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh new file mode 100755 index 00000000000..2c8b6c91e5a --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1a.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# chainali_1a is as 1a except it uses chain alignments (using 1a system) instead of gmm alignments + +# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_1a/ +# System cnn_chainali_1a cnn_1a +# WER 6.69 9.13 +# Final train prob -0.0128 -0.0297 +# Final valid prob -0.0447 -0.0975 +# Final train prob (xent) -0.6448 -0.5915 +# Final valid prob (xent) -0.9924 -1.0022 + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1a/ +# exp/chain/cnn_chainali_1a/: 
num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.002->0.000 xent:train/valid[13,20,final]=(-0.929,-0.711,-0.645/-1.16,-1.04,-0.992) logprob:train/valid[13,20,final]=(-0.029,-0.016,-0.013/-0.051,-0.047,-0.045) + +# cat exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_* +# %WER 3.94 [ 2600 / 65921, 549 ins, 837 del, 1214 sub ] exp/chain/cnn_chainali_1a/decode_test/cer_15_0.0 +# %WER 6.69 [ 1241 / 18542, 135 ins, 358 del, 748 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_15_0.5 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. 
We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false 
dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh new file mode 100755 index 00000000000..ddf596a6126 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer. 
+# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/ +# System cnn_chainali_1a cnn_chainali_1b +# WER 6.69 6.25 +# Final train prob -0.0132 -0.0041 +# Final valid prob -0.0509 -0.0337 +# Final train prob (xent) -0.6393 -0.6287 +# Final valid prob (xent) -1.0116 -0.9064 + +# steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/ +# exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038) + +# cat exp/chain/cnn_chainali_1b/decode_test/scoring_kaldi/best_* +# %WER 3.94 [ 2600 / 65921, 415 ins, 1285 del, 900 sub ] exp/chain/cnn_chainali_1b/decode_test/cer_10_0.0 +# %WER 6.25 [ 1158 / 18542, 103 ins, 469 del, 586 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.0 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! 
cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + 
conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py new file mode 100755 index 00000000000..b998464953f --- /dev/null +++ b/egs/iam/v1/local/make_features.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + + eg. 
local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('dir', type=str, + help='Source data directory (containing images.scp)') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + padding_x = args.padding + padding_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((padding_y, padding_x), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((padding_y, padding_x), + dtype=int)), axis=1) + return im_pad1 + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +with open(data_list_path) as f: + for line in f: + line = 
line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scale = get_scaled_image(im) + + data = np.transpose(im_scale, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh new file mode 100755 index 00000000000..1350c5841df --- /dev/null +++ b/egs/iam/v1/local/prepare_data.sh @@ -0,0 +1,149 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the IAM handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +stage=0 +download_dir=data/download +username= +password= # username and password for downloading the IAM database + # if you have not already downloaded the database, please + # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database + # and provide this script with your username and password. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then + echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files" + echo "exist in your data/local directory this script will fail because the required files" + echo "can't be downloaded automatically (it needs registration)." 
+ echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" + echo "... and then call this script again with --username --password " + echo "" +fi + +lines=data/local/lines +xml=data/local/xml +ascii=data/local/ascii +bcorpus=data/local/browncorpus +lobcorpus=data/local/lobcorpus +data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask +lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz +xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz +data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip +ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz +brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt +lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +mkdir -p $download_dir data/local + +# download and extract images and transcription +if [ -d $lines ]; then + echo "$0: Not downloading lines images as it is already there." +else + if [ ! -f $download_dir/lines.tgz ]; then + echo "$0: Trying to download lines images..." + wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1; + fi + mkdir -p $lines + tar -xzf $download_dir/lines.tgz -C $lines || exit 1; + echo "$0: Done downloading and extracting lines images" +fi + +if [ -d $xml ]; then + echo "$0: Not downloading transcriptions as it is already there." +else + if [ ! -f $download_dir/xml.tgz ]; then + echo "$0: Trying to download transcriptions..." + wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1; + fi + mkdir -p $xml + tar -xzf $download_dir/xml.tgz -C $xml || exit 1; + echo "$0: Done downloading and extracting transcriptions." +fi + +if [ -d $data_split_info ]; then + echo "$0: Not downloading data split information as it is already there." +else + if [ ! 
-f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo "$0: Trying to download training and testing data split information..." + wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo "$0: Done downloading and extracting training and testing data split information" +fi + +if [ -d $ascii ]; then + echo "$0: Not downloading ascii.tgz as it is already there." +else + if [ ! -f $download_dir/ascii.tgz ]; then + echo "$0: trying to download ascii.tgz..." + wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1; + echo "$0: Done downloading and extracting ascii.tgz" +fi + +if [ -d $lobcorpus ]; then + echo "$0: Not downloading the LOB text corpus as it is already there." +else + if [ ! -f $lobcorpus/0167.zip ]; then + echo "$0: Downloading the LOB text corpus ..." + mkdir -p $lobcorpus + wget -P $lobcorpus/ $lob_corpus_url || exit 1; + fi + unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1; + echo "$0: Done downloading and extracting LOB corpus" +fi + +if [ -d $bcorpus ]; then + echo "$0: Not downloading the Brown corpus as it is already there." +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo "$0: Downloading the Brown text corpus..." 
+ wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo "$0: Done downloading the Brown text corpus" +fi + +mkdir -p data/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask + +train_old="data/local/$file_name/trainset.txt" +test_old="data/local/$file_name/testset.txt" +val1_old="data/local/$file_name/validationset1.txt" +val2_old="data/local/$file_name/validationset2.txt" + +train_new="data/local/train.uttlist" +test_new="data/local/test.uttlist" +val_new="data/local/validation.uttlist" + +cat $train_old > $train_new +cat $test_old > $test_new +cat $val1_old $val2_old > $val_new + +if [ $stage -le 0 ]; then + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 + + utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..77a46df384f --- /dev/null +++ b/egs/iam/v1/local/prepare_dict.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +mkdir -p $dir + +# First get the set of all letters that occur in data/train/text +cat data/train/text | \ + perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ + sort -u > $dir/nonsilence_phones.txt + +# Now list all the unique words (that use only the above letters) +# in data/train/text and LOB+Brown corpora with their comprising +# letters as their transcription. 
(Letter # is replaced with ) + +export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") + +cut -d' ' -f2- data/train/text | \ + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt \ + data/local/browncorpus/brown.txt - | \ + perl -e '$letters=$ENV{letters}; +while(<>){ @A = split; + foreach(@A) { + if(! $seen{$_} && $_ =~ m/^[$letters]+$/){ + $seen{$_} = 1; + $trans = join(" ", split(//)); + $trans =~ s/#//g; + print "$_ $trans\n"; + } + } +}' | sort > $dir/lexicon.txt + + +sed -i "s/#//" $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/iam/v1/local/process_data.py b/egs/iam/v1/local/process_data.py new file mode 100755 index 00000000000..fa5eb484707 --- /dev/null +++ b/egs/iam/v1/local/process_data.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. 
Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('out_dir', type=str, + help='Where to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.database_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = 
form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v1/local/score.sh b/egs/iam/v1/local/score.sh new file mode 100755 index 00000000000..31564d25326 --- /dev/null +++ b/egs/iam/v1/local/score.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh new file mode 100755 index 00000000000..aa4303d6a28 --- /dev/null +++ b/egs/iam/v1/local/train_lm.sh @@ -0,0 +1,139 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the LOB+Brown text data and IAM training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. 
+ else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt > ${dir}/data/text/text.txt + cat data/local/browncorpus/brown.txt >> ${dir}/data/text/text.txt + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/iam.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. 
+ cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + cat ${dir}/data/text/{iam,text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 iam=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #log-prob: -5.05603614242 [perplexity = 156.967086371] over 19477.0 words +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. 
+ size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_big was -5.06654404785 per word [perplexity = 158.625177948] over 19477.0 words + # current results, after adding --limit-unk-history=true: + + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_small was -5.24719139498 per word [perplexity = 190.031793995] over 19477.0 words + # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): + + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/v1/local/wer_output_filter b/egs/iam/v1/local/wer_output_filter new file mode 100755 index 00000000000..162482539ed --- /dev/null +++ b/egs/iam/v1/local/wer_output_filter @@ -0,0 +1,27 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! 
Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ' '.join(words[1:]) + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/iam/v1/path.sh b/egs/iam/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/iam/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh new file mode 100755 index 00000000000..c8ebb9ae649 --- /dev/null +++ b/egs/iam/v1/run.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian + +set -e +stage=0 +nj=20 + +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." 
+ local/prepare_data.sh --download-dir "$iam_database" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + +if [ $stage -le 4 ]; then + steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ + data/lang exp/mono +fi + +if [ $stage -le 5 ]; then + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ + exp/mono/decode_test +fi + +if [ $stage -le 6 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/mono exp/mono_ali + + steps/train_deltas.sh --cmd $cmd 500 20000 data/train data/lang \ + exp/mono_ali exp/tri +fi + +if [ $stage -le 7 ]; then + utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph + + steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ + exp/tri/decode_test +fi + +if [ $stage -le 8 ]; then + steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ + exp/tri exp/tri_ali + + steps/train_lda_mllt.sh --cmd $cmd \ + --splice-opts "--left-context=3 --right-context=3" 500 20000 \ + data/train data/lang exp/tri_ali exp/tri2 +fi + +if [ $stage -le 9 ]; then + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + + steps/decode.sh --nj 
$nj --cmd $cmd exp/tri2/graph \ + data/test exp/tri2/decode_test +fi + +if [ $stage -le 10 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri2 exp/tri2_ali + + steps/train_sat.sh --cmd $cmd 500 20000 \ + data/train data/lang exp/tri2_ali exp/tri3 +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph + + steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \ + data/test exp/tri3/decode_test +fi + +if [ $stage -le 12 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train data/lang exp/tri3 exp/tri3_ali +fi + +if [ $stage -le 13 ]; then + local/chain/run_cnn_1a.sh +fi + +if [ $stage -le 14 ]; then + local/chain/run_cnn_chainali_1b.sh --chain-model-dir exp/chain/cnn_1a --stage 2 +fi diff --git a/egs/iam/v1/steps b/egs/iam/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iam/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iam/v1/utils b/egs/iam/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iam/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/uw3/README.txt b/egs/uw3/README.txt new file mode 100644 index 00000000000..b02d00ff541 --- /dev/null +++ b/egs/uw3/README.txt @@ -0,0 +1,4 @@ + +This directory contains example scripts for optical character recognition +(i.e. OCR) on the UW3 dataset (it's a printed English OCR corpus): +http://isis-data.science.uva.nl/events/dlia//datasets/uwash3.html diff --git a/egs/uw3/v1/cmd.sh b/egs/uw3/v1/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/uw3/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/uw3/v1/image b/egs/uw3/v1/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/uw3/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/uw3/v1/local/chain/compare_wer.sh b/egs/uw3/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..1a40523355a --- /dev/null +++ b/egs/uw3/v1/local/chain/compare_wer.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# ./local/chain/compare_wer.sh exp/chain/cnn1a +# System cnn1a +# WER 0.61 +# CER 0.15 +# Final train prob -0.0377 +# Final valid prob -0.0380 +# Final train prob (xent) -0.0830 +# Final valid prob (xent) -0.0838 + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... 
]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/uw3/v1/local/chain/run_cnn_1a.sh b/egs/uw3/v1/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..ad7421e1261 --- /dev/null +++ b/egs/uw3/v1/local/chain/run_cnn_1a.sh @@ -0,0 +1,234 @@ +#!/bin/bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# steps/info/chain_dir_info.pl exp/chain/cnn1a/ +# exp/chain/cnn1a/: num-iters=153 nj=3..10 num-params=3.6M dim=40->268 combine=-0.034->-0.034 xent:train/valid[101,152,final]=(-0.097,-0.186,-0.092/-0.101,-0.212,-0.098) logprob:train/valid[101,152,final]=(-0.035,-0.067,-0.035/-0.036,-0.082,-0.035) + +# cat 
exp/chain/cnn1a/decode_test/scoring_kaldi/best_* +# %WER 0.19 [ 366 / 188135, 110 ins, 123 del, 133 sub ] exp/chain/cnn1a/decode_test/cer_7_0.5 +# %WER 1.00 [ 357 / 35571, 104 ins, 26 del, 227 sub ] exp/chain/cnn1a/decode_test/wer_5_1.0 + + +set -e -o pipefail + +stage=0 +nj=30 + +# affix for exp dirs, e.g. it was _cleaned in tedlium. +nnet3_affix= + +affix=1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=5 + +# training chunk-options +chunk_width=340,300,200,100 + +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=false + +gmm_dir=exp/tri2 +ali_dir=exp/tri2_ali +lat_dir=exp/chain${nnet3_affix}/tri2_train_lats +dir=exp/chain${nnet3_affix}/cnn${affix} +train_data_dir=data/train +lores_train_data_dir=$train_data_dir # for the start, use the same data for gmm and chain +gmm_lang=data/lang +lang_test=data/lang_unk +tree_dir=exp/chain${nnet3_affix}/tree${affix} + +# the 'lang' directory is created by this script. +# If you create such a directory with a non-standard topology +# you should probably name it differently. +lang=data/lang_chain + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$cmd" ${lores_train_data_dir} \ + $lang_test $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. 
+ if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" 300 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=12" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-2,-1,0,1,2 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=40 time-offsets=-2,-1,0,1,2 $common1 + relu-batchnorm-layer name=tdnn1 input=Append(-2,-1,0,1,2) dim=450 + relu-batchnorm-layer name=tdnn2 input=Append(-5,0,5) dim=450 + relu-batchnorm-layer name=tdnn3 input=Append(-5,0,5) dim=450 + relu-batchnorm-layer name=tdnn4 input=Append(-5,0,5) dim=450 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=450 target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. 
+ relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=450 target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # 
The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/uw3/v1/local/make_features.py b/egs/uw3/v1/local/make_features.py new file mode 100755 index 00000000000..dd0a30a19d7 --- /dev/null +++ b/egs/uw3/v1/local/make_features.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + + eg. 
local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc +from scipy import ndimage + +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE,SIG_DFL) + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('dir', type=str, help='data directory (should contain images.scp)') +parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file.') +parser.add_argument('--feat-dim', type=int, default=40, + help='size to scale the height of all images (i.e. the dimension of the resulting features)') +parser.add_argument('--pad', type=bool, default=False, help='pad the left and right of the images with 10 white pixels.') + +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + # Some Images are rotated + if sy > sx: + im = np.rot90(im, k = -1) + sx = im.shape[1] + sy = im.shape[0] + + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + + noise = np.random.normal(2, 1,(nx, ny)) + im = im - noise + + return im + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +with 
open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + + im = misc.imread(image_path, flatten = True) + im_scale = get_scaled_image(im) + + if args.pad: + pad = np.ones((args.feat_dim, 10)) * 255 + im_data = np.hstack((pad, im_scale, pad)) + else: + im_data = im_scale + + data = np.transpose(im_data, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/uw3/v1/local/prepare_data.sh b/egs/uw3/v1/local/prepare_data.sh new file mode 100755 index 00000000000..47f62e4335a --- /dev/null +++ b/egs/uw3/v1/local/prepare_data.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang + +# This script downloads the UW3 dataset (if not already downloaded) +# and prepares the "train" and "test" data subsets. + +set -e +download_dir=data/download + +. ./cmd.sh +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +# Download dir +download_url=http://www.tmbdev.net/ocrdata/uw3-lines-book.tgz +data_dir=data/local/extracted_corpus + +mkdir -p $download_dir +mkdir -p $data_dir + +if [ -d $data_dir/book ]; then + echo "$0: Not downloading dataset as it is already downloaded." +else + if [ ! -f $download_dir/uw3-lines-book.tgz ]; then + echo "$0: Downloading dataset..." + wget -P $download_dir $download_url || exit 1; + fi + echo "$0: Extracting..." + tar -xzf $download_dir/uw3-lines-book.tgz -C $data_dir/ || exit 1; + echo "$0: Done downloading/extracting the dataset." +fi + +mkdir -p data/train +mkdir -p data/test +echo "$0: Preparing the test and train subsets..." 
+local/process_data.py $data_dir/book data || exit 1 + +utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt +utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt diff --git a/egs/uw3/v1/local/prepare_dict.sh b/egs/uw3/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..72c9b50e5ec --- /dev/null +++ b/egs/uw3/v1/local/prepare_dict.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Hossein Hadian + +set -e +dir=data/local/dict + +mkdir -p $dir + +cut -d' ' -f2- data/train/text | tr -cs '[a-z][A-Z][0-9][:punct:]' '\n' | sort -u | \ + awk '{len=split($0,chars,""); printf($0); + for (i=0;i<=len;i++) { + if(chars[i]=="#") {chars[i]=""} + printf(chars[i]" ") + }; + printf("\n")};' | \ + sed 's/.$//' > $dir/lexicon.txt; + +cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u >$dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL > $dir/optional_silence.txt + +echo -n "" > $dir/extra_questions.txt diff --git a/egs/uw3/v1/local/process_data.py b/egs/uw3/v1/local/process_data.py new file mode 100755 index 00000000000..f5b37b04c2f --- /dev/null +++ b/egs/uw3/v1/local/process_data.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang + +# This script goes through the downloaded UW3 dataset and creates data files "text", +# "utt2spk", and "images.scp" for the train and test subsets in data/train and data/test. 
+ +# text - matches the transcriptions with the image id +# utt2spk - matches the image id's with the speaker/writer names +# images.scp - matches the image is's with the actual image file + +import argparse +import os +import random + +parser = argparse.ArgumentParser(description="""Creates data/train and data/test.""") +parser.add_argument('database_path', type=str, help='path to downloaded (and extracted) UW3 corpus') +parser.add_argument('out_dir', type=str, default='data', + help='where to create the train and test data directories') +args = parser.parse_args() + +### main ### +train_text_file = os.path.join(args.out_dir, 'train', 'text') +train_text_fh = open(train_text_file, 'w+') +train_utt2spk_file = os.path.join(args.out_dir, 'train', 'utt2spk') +train_utt2spk_fh = open(train_utt2spk_file, 'w+') +train_image_file = os.path.join(args.out_dir, 'train', 'images.scp') +train_image_fh = open(train_image_file, 'w+') + +test_text_file = os.path.join(args.out_dir, 'test', 'text') +test_text_fh = open(test_text_file, 'w+') +test_utt2spk_file = os.path.join(args.out_dir, 'test', 'utt2spk') +test_utt2spk_fh = open(test_utt2spk_file, 'w+') +test_image_file = os.path.join(args.out_dir, 'test', 'images.scp') +test_image_fh = open(test_image_file, 'w+') + +random.seed(0) +page_count = 0 +for page in sorted(os.listdir(args.database_path)): + page_path = os.path.join(args.database_path, page) + page_count = page_count + 1 + for line in sorted(os.listdir(page_path)): + if line.endswith('.txt'): + text_path = os.path.join(args.database_path, page, line) + image_name = line.split('.')[0] + image_path = os.path.join(args.database_path, page, image_name + '.png') + utt_id = page + '_' + image_name + gt_fh = open(text_path, 'r') + text = gt_fh.readlines()[0].strip() + + # The UW3 dataset doesn't have established training and testing splits + # The dataset is randomly split train 95% and test 5% + coin = random.randint(0, 20) + if coin >= 1: + train_text_fh.write(utt_id + ' ' + 
text + '\n') + train_utt2spk_fh.write(utt_id + ' ' + str(page_count) + '\n') + train_image_fh.write(utt_id + ' ' + image_path + '\n') + elif coin < 1: + test_text_fh.write(utt_id + ' ' + text + '\n') + test_utt2spk_fh.write(utt_id + ' ' + str(page_count) + '\n') + test_image_fh.write(utt_id + ' ' + image_path + '\n') diff --git a/egs/uw3/v1/local/score.sh b/egs/uw3/v1/local/score.sh new file mode 100755 index 00000000000..9ea4701a833 --- /dev/null +++ b/egs/uw3/v1/local/score.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 + +# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's +# using local/unk_arc_post_to_transcription.py and also it calls +# steps/scoring/score_kaldi_cer.sh at the end. + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=3 +max_lmwt=13 +iter=final +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 +model_path=`echo $dir |xargs dirname` +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $lang_or_graph/phones/word_boundary.int $model_path/final.mdl ark:- ark:- \| \ + lattice-arc-post $model_path/final.mdl ark:- - \| \ + local/unk_arc_post_to_transcription.py $lang_or_graph/phones.txt $lang_or_graph/words.txt data/lang_unk/oov.int \| \ + $hyp_filtering_cmd '>' 
$dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + tr '[:upper:]' '[:lower:]' \| \ + compute-wer --text --mode=present \ + "ark:cat $dir/scoring_kaldi/test_filt.txt| tr '[:upper:]' '[:lower:]' |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." 
+ exit 1; + fi + + if $stats; then + mkdir -p $dir/scoring_kaldi/wer_details + echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight + echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty + + $cmd $dir/scoring_kaldi/log/stats1.log \ + cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \ + align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \ + utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\ + utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1; + + $cmd $dir/scoring_kaldi/log/stats2.log \ + cat $dir/scoring_kaldi/wer_details/per_utt \| \ + utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \ + sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1; + + $cmd $dir/scoring_kaldi/log/wer_bootci.log \ + compute-wer-bootci --mode=present \ + ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \ + '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1; + + fi +fi + +steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 $data $lang_or_graph $dir + +# If we got here, the scoring was successful. 
+# As a small aid to prevent confusion, we remove all wer_{?,??} files; +# these originate from the previous version of the scoring files +# I keep both statements here because removing only one could lead to confusion about +# the capabilities of the script (we don't do cer in the script) +rm $dir/wer_{?,??} 2>/dev/null +rm $dir/cer_{?,??} 2>/dev/null + +exit 0; diff --git a/egs/uw3/v1/local/train_lm.sh b/egs/uw3/v1/local/train_lm.sh new file mode 100755 index 00000000000..39eb051d273 --- /dev/null +++ b/egs/uw3/v1/local/train_lm.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 +# +# +# This script trains an LM on the UW3 training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + +. ./path.sh +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +num_dev_sentences=4500 +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure that make_lm_dir.py is called and train only the 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + head -n $num_dev_sentences < data/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + tail -n +$[$num_dev_sentences+1] < data/train/text | cut -d " " -f 2- > ${dir}/data/text/uw3.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (a subset of the training data is used as ${dir}/data/text/uw3.txt to work + # out interpolation weights. + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get wordlist + cat ${dir}/data/text/uw3.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 uw3=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=uw3 ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + + # No need for pruning as the training data is quite small (total # of + # n-grams is 685k). Write the arpa: + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/uw3/v1/local/unk_arc_post_to_transcription.py b/egs/uw3/v1/local/unk_arc_post_to_transcription.py new file mode 100755 index 00000000000..c86d35e4b8a --- /dev/null +++ b/egs/uw3/v1/local/unk_arc_post_to_transcription.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python + +# Copyright 2017 Ashish Arora + +import argparse +import sys + +parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""") +parser.add_argument('phones', type=str, help='phones and phonesID') +parser.add_argument('words', type=str, help='word and wordID') +parser.add_argument('unk', type=str, default='-', help='location of unk file') +parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data') +parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data') +args = parser.parse_args() +### main ### +phone_fh = open(args.phones, 'r') +word_fh = open(args.words, 'r') +unk_fh = open(args.unk,'r') +if args.input_ark == '-': 
+ input_fh = sys.stdin +else: + input_fh = open(args.input_ark,'r') +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +phone_dict = dict()# stores phoneID and phone mapping +phone_data_vect = phone_fh.read().strip().split("\n") +for key_val in phone_data_vect: + key_val = key_val.split(" ") + phone_dict[key_val[1]] = key_val[0] +word_dict = dict() +word_data_vect = word_fh.read().strip().split("\n") +for key_val in word_data_vect: + key_val = key_val.split(" ") + word_dict[key_val[1]] = key_val[0] +unk_val = unk_fh.read().strip().split(" ")[0] + +utt_word_dict = dict() +utt_phone_dict = dict()# stores utteranceID and phoneID +unk_word_dict = dict() +count=0 +for line in input_fh: + line_vect = line.strip().split("\t") + if len(line_vect) < 6: + print "IndexError" + print line_vect + continue + uttID = line_vect[0] + word = line_vect[4] + phones = line_vect[5] + if uttID in utt_word_dict.keys(): + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + else: + count = 0 + utt_word_dict[uttID] = dict() + utt_phone_dict[uttID] = dict() + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + if word == unk_val: # get character sequence for unk + phone_key_vect = phones.split(" ") + phone_val_vect = list() + for pkey in phone_key_vect: + phone_val_vect.append(phone_dict[pkey]) + phone_2_word = list() + for phone_val in phone_val_vect: + phone_2_word.append(phone_val.split('_')[0]) + phone_2_word = ''.join(phone_2_word) + utt_word_dict[uttID][count] = phone_2_word + else: + if word == '0': + word_val = ' ' + else: + word_val = word_dict[word] + utt_word_dict[uttID][count] = word_val + count += 1 + +transcription = "" +for key in sorted(utt_word_dict.iterkeys()): + transcription = key + for index in sorted(utt_word_dict[key].iterkeys()): + value = utt_word_dict[key][index] + transcription = transcription + " " + value + out_fh.write(transcription + '\n') diff --git a/egs/uw3/v1/path.sh 
b/egs/uw3/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/uw3/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/uw3/v1/run.sh b/egs/uw3/v1/run.sh new file mode 100755 index 00000000000..68c51fa4690 --- /dev/null +++ b/egs/uw3/v1/run.sh @@ -0,0 +1,106 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian + + +set -e +stage=0 +nj=30 + +# This is the database path on the JHU grid. You may set this +# to data/download, in which case the script will automatically download +# the database: +uw3_database=/export/a10/corpora5/handwriting_ocr/UW3/ + +. ./path.sh +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. + + +if [ $stage -le 0 ]; then + # Data preparation + local/prepare_data.sh --download-dir "$uw3_database" +fi + +mkdir -p data/{train,test}/data +if [ $stage -le 1 ]; then + echo "$0: Preparing feature files for the test and training data..." + for f in train test; do + local/make_features.py --feat-dim 40 --pad true data/$f | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$f/data/images.ark,data/$f/feats.scp || exit 1 + + steps/compute_cmvn_stats.sh data/$f || exit 1; + done +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." 
+ local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test + + echo "$0: Preparing the unk model for open-vocab decoding..." + utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \ + data/local/dict exp/unk_lang_model + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ + --unk-fst exp/unk_lang_model/unk_fst.txt \ + data/local/dict "" data/lang_unk/temp data/lang_unk + cp data/lang_test/G.fst data/lang_unk/G.fst +fi + +if [ $stage -le 4 ]; then + steps/train_mono.sh --nj $nj --cmd $cmd \ + data/train data/lang exp/mono +fi + +if [ $stage -le 5 ]; then + steps/align_si.sh --nj $nj --cmd $cmd \ + data/train data/lang exp/mono exp/mono_ali + steps/train_deltas.sh --cmd $cmd 500 20000 \ + data/train data/lang exp/mono_ali exp/tri +fi + +if [ $stage -le 6 ]; then + steps/align_si.sh --nj $nj --cmd $cmd \ + data/train data/lang exp/tri exp/tri_ali + steps/train_lda_mllt.sh --cmd $cmd --splice-opts "--left-context=3 --right-context=3" 500 20000 \ + data/train data/lang exp/tri_ali exp/tri2 +fi + +if [ $stage -le 7 ]; then + utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph + steps/decode.sh --nj $nj --cmd $cmd \ + exp/mono/graph data/test exp/mono/decode_test +fi + +if [ $stage -le 8 ]; then + utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph + steps/decode.sh --nj $nj --cmd $cmd \ + exp/tri/graph data/test exp/tri/decode_test +fi + +if [ $stage -le 9 ]; then + utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph + steps/decode.sh --nj $nj --cmd $cmd \ + exp/tri2/graph data/test exp/tri2/decode_test +fi + +if [ $stage -le 10 ]; then + steps/align_si.sh --nj $nj --cmd $cmd --use-graphs true \ + data/train 
data/lang exp/tri2 exp/tri2_ali +fi + +if [ $stage -le 11 ]; then + local/chain/run_cnn_1a.sh +fi diff --git a/egs/uw3/v1/steps b/egs/uw3/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/uw3/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/uw3/v1/utils b/egs/uw3/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/uw3/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From 8f79b01faf8a392d170ff11ddad0733d6cb38c0d Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 4 Jan 2018 22:54:45 -0800 Subject: [PATCH 054/184] [src,scripts] Remove BatchNormComponent 'power' option --- .../steps/libs/nnet3/xconfig/basic_layers.py | 7 +-- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 7 +-- src/nnet3/nnet-normalize-component.cc | 59 ++++++------------- src/nnet3/nnet-normalize-component.h | 10 +--- src/nnet3/nnet-test-utils.cc | 1 - 5 files changed, 24 insertions(+), 60 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index c59e4a6041e..0e27e6369e4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -674,7 +674,6 @@ def set_default_configs(self): 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, - 'batchnorm-power': -0.5, 'ng-affine-options': '', 'ng-linear-options': '', # only affects bottleneck layers. 
'dropout-proportion': 0.5, # dropout-proportion only @@ -754,7 +753,6 @@ def _add_components(self, input_desc, input_dim, nonlinearities): output_dim = self.output_dim() self_repair_scale = self.config['self-repair-scale'] target_rms = self.config['target-rms'] - batchnorm_power = self.config['batchnorm-power'] affine_options = self.config['ng-affine-options'] for opt_name in [ 'max-change', 'learning-rate-factor', @@ -845,10 +843,9 @@ def _add_components(self, input_desc, input_dim, nonlinearities): elif nonlinearity == 'batchnorm': line = ('component name={0}.{1}' - ' type=BatchNormComponent dim={2}' - ' target-rms={3} power={4}' + ' type=BatchNormComponent dim={2} target-rms={3}' ''.format(self.name, nonlinearity, output_dim, - target_rms, batchnorm_power)) + target_rms)) elif nonlinearity == 'so': line = ('component name={0}.{1}' diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 85454795435..67537f574e4 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -829,9 +829,6 @@ def set_default_configs(self): 'clipping-threshold': 30.0, 'zeroing-interval': 20, 'zeroing-threshold': 15.0, - # batchnorm-power is for what i'm going to call OverNorm, you can set it - # for example to -0.75. 
- 'batchnorm-power': -0.5, 'delay' : -1, 'lstm-nonlinearity-options' : ' max-change=0.75', # the recurrence scale is the scale on m_trunc, used in the @@ -945,8 +942,8 @@ def _generate_lstm_config(self): configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( name, 2 * cell_dim, bptrunc_str)) - configs.append("component name={0}.m_batchnorm type=BatchNormComponent power={1} dim={2} ".format( - name, self.config['batchnorm-power'], cell_dim)) + configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim)) configs.append("### Nodes for the components above.") configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " diff --git a/src/nnet3/nnet-normalize-component.cc b/src/nnet3/nnet-normalize-component.cc index e6be8210bb0..507a30d1aa1 100644 --- a/src/nnet3/nnet-normalize-component.cc +++ b/src/nnet3/nnet-normalize-component.cc @@ -234,8 +234,9 @@ void BatchNormComponent::ComputeDerived() { // of numerical roundoff. scale_.ApplyFloor(0.0); scale_.Add(epsilon_); - scale_.ApplyPow(power_); - // now scale_ = min(variance, epsilon)^power_ + BaseFloat power = -0.5; + scale_.ApplyPow(power); + // now scale_ = min(variance, epsilon)^power // next, multiply by the target RMS (normally 1.0). 
scale_.Scale(target_rms_); offset_.MulElements(scale_); @@ -253,7 +254,7 @@ void BatchNormComponent::Check() const { } BatchNormComponent::BatchNormComponent(const BatchNormComponent &other): - dim_(other.dim_), block_dim_(other.block_dim_), power_(other.power_), + dim_(other.dim_), block_dim_(other.block_dim_), epsilon_(other.epsilon_), target_rms_(other.target_rms_), test_mode_(other.test_mode_), count_(other.count_), stats_sum_(other.stats_sum_), stats_sumsq_(other.stats_sumsq_) { @@ -267,7 +268,6 @@ std::string BatchNormComponent::Info() const { stream << Type() << ", dim=" << dim_ << ", block-dim=" << block_dim_ << ", epsilon=" << epsilon_ << ", target-rms=" << target_rms_ << ", count=" << count_ - << ", power=" << power_ << ", test-mode=" << (test_mode_ ? "true" : "false"); if (count_ > 0) { Vector mean(stats_sum_), var(stats_sumsq_); @@ -286,14 +286,12 @@ std::string BatchNormComponent::Info() const { void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { dim_ = -1; block_dim_ = -1; - power_ = -0.5; epsilon_ = 1.0e-03; target_rms_ = 1.0; test_mode_ = false; bool ok = cfl->GetValue("dim", &dim_); cfl->GetValue("block-dim", &block_dim_); cfl->GetValue("epsilon", &epsilon_); - cfl->GetValue("power", &power_); cfl->GetValue("target-rms", &target_rms_); cfl->GetValue("test-mode", &test_mode_); if (!ok || dim_ <= 0) { @@ -307,8 +305,6 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); - if (power_ >= 0 || power_ <= -1.0) - KALDI_ERR << "Power has invalid value " << power_; count_ = 0; stats_sum_.Resize(block_dim_); stats_sumsq_.Resize(block_dim_); @@ -393,13 +389,12 @@ void BatchNormComponent::InitFromConfig(ConfigLine *cfl) { BACKWARD PASS (recap): var_deriv_mod = 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + .. 
which for power = -0.5, simplifies to: + var_deriv_mod = -1.0 / (target-rms^2) * (1/I \sum_i z'(i) z(i)) * scale x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod */ - - - void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &in, CuMatrixBase *out) const { @@ -435,15 +430,16 @@ void* BatchNormComponent::Propagate(const ComponentPrecomputedIndexes *indexes, mean.AddRowSumMat(1.0 / num_frames, in, 0.0); uvar.AddDiagMat2(1.0 / num_frames, in, kTrans, 0.0); scale.CopyFromVec(uvar); + // by applying this scale at this point, we save a multiply later on. - BaseFloat var_scale = std::pow(target_rms_, 1.0 / power_); + BaseFloat var_scale = 1.0 / (target_rms_ * target_rms_); scale.AddVecVec(-var_scale, mean, mean, var_scale); - // at this point, 'scale' contains just the variance (times target-rms^{-power}) + // at this point, 'scale' contains just the variance (times target-rms^{-2}). scale.ApplyFloor(0.0); scale.Add(var_scale * epsilon_); // Now 'scale' contains the variance floored to zero and then with epsilon - // added [both times target-rms^{-power}] - scale.ApplyPow(power_); + // added [both times 1/target-rms^2]. + scale.ApplyPow(-0.5); // now 'scale' is the actual scale we'll use. // the next command will do no work if out == in, for in-place propagation. @@ -509,18 +505,19 @@ void BatchNormComponent::Backprop( KALDI_ASSERT(out_value.NumRows() == num_frames); CuSubVector scale(memo->mean_uvar_scale, 2), - temp(memo->mean_uvar_scale, 4), var_deriv_mod(memo->mean_uvar_scale, 3), - scale_pow(memo->mean_uvar_scale, 4); + temp(memo->mean_uvar_scale, 4); // var_deriv_mod is going to contain: // 2 * power * target-rms^{1/power} * (1/I \sum_i z'(i) z(i)) * scale^{-(1+power)/power} + // which for power = -0.5 simplifies to: + // -1.0 / (target_rms * target_rms). // but for now we don't have the power of 'scale', we'll add that later. 
- BaseFloat coeff = 2.0 * power_ * std::pow(target_rms_, 1.0 / power_) / - num_frames; + BaseFloat coeff = -1.0 / (target_rms_ * target_rms_ * num_frames); + var_deriv_mod.AddDiagMatMat(coeff, out_value, kTrans, out_deriv, kNoTrans, 0.0); - + var_deriv_mod.MulElements(scale); temp.AddRowSumMat(-1.0 / num_frames, out_deriv, 0.0); // the following statement does no work if in_deriv and out_deriv are the @@ -533,19 +530,9 @@ void BatchNormComponent::Backprop( // At this point, *in_deriv contains // scale * (z'(i) - 1/I * \sum_i z'(i)) - // The next few lines complete the calculation of 'var_deriv_mod'; - // we delayed it because we were using 'temp', and 'scale_pow' - // uses the same memory. - if (power_ == -0.5) { - // we can simplify scale^{-(1+power)/power} to just 'scale'. - var_deriv_mod.MulElements(scale); - } else { - scale_pow.CopyFromVec(scale); - scale_pow.ApplyPow(-1.0 * (1.0 + power_) / power_); - var_deriv_mod.MulElements(scale_pow); - } in_deriv->AddMatDiagVec(1.0, out_value, kNoTrans, var_deriv_mod, 1.0); + // At this point, *in_deriv contains what we described in the comment // starting BATCHNORM_MATH as: // x'(i) = scale * (z'(i) - 1/I * \sum_i z'(i)) + z(i) var_deriv_mod @@ -602,12 +589,6 @@ void BatchNormComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &block_dim_); - if (PeekToken(is, binary) == 'P') { - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &power_); - } else { - power_ = -0.5; - } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &epsilon_); ExpectToken(is, binary, ""); @@ -635,10 +616,6 @@ void BatchNormComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dim_); WriteToken(os, binary, ""); WriteBasicType(os, binary, block_dim_); - if (power_ != -0.5) { - WriteToken(os, binary, ""); - WriteBasicType(os, binary, power_); - } WriteToken(os, binary, ""); WriteBasicType(os, binary, epsilon_); WriteToken(os, 
binary, ""); diff --git a/src/nnet3/nnet-normalize-component.h b/src/nnet3/nnet-normalize-component.h index b10c3e4a60c..1806fe38493 100644 --- a/src/nnet3/nnet-normalize-component.h +++ b/src/nnet3/nnet-normalize-component.h @@ -232,8 +232,8 @@ class BatchNormComponent: public Component { // 'sum_sumsq_scale' is of dimension 5 by block_dim_: // Row 0 = mean = the mean of the rows of the input // Row 1 = uvar = the uncentered variance of the input (= sumsq / num_frames). - // Row 2 = scale = the scale of the renormalization, which is - // Rows 3 and 4 are used as a temporaries in Backprop. + // Row 2 = scale = the scale of the renormalization. + // Rows 3 and 4 are used as temporaries in Backprop. CuMatrix mean_uvar_scale; }; @@ -260,12 +260,6 @@ class BatchNormComponent: public Component { // always will in the new code in nnet-convolutional-component.h. int32 block_dim_; - - // This power determines the scale as a power of the variance... the default - // (-0.5) corresponds to regular BatchNorm, but you can set it to other - // values, like -0.25 or -0.4, for what we'll call "fractional BatchNorm" - BaseFloat power_; - // Used to avoid exact-zero variances, epsilon has the dimension of a // covariance. BaseFloat epsilon_; diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 472a02197e5..48a97df9ea1 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -1681,7 +1681,6 @@ static void GenerateRandomComponentConfig(std::string *component_type, << " block-dim=" << block_dim << " target-rms=" << RandInt(1, 4) << " test-mode=" << (test_mode ? "true" : "false") - << " power=" << (-0.1 * RandInt(3, 5)) << " epsilon=" << (RandInt(0, 1) == 0 ? 
"0.1" : "1.0"); break; } From d5660c54d8da285dfadf499249cf310e42cda841 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 5 Jan 2018 21:20:59 -0500 Subject: [PATCH 055/184] [egs] Add newly tuned mini-librispeech example with factored output layer --- .../s5/local/chain/run_tdnn.sh | 2 +- .../s5/local/chain/tuning/run_tdnn_1f.sh | 308 ++++++++++++++++++ 2 files changed, 309 insertions(+), 1 deletion(-) create mode 100755 egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh diff --git a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh index 75da1a0a553..cb5756188a4 120000 --- a/egs/mini_librispeech/s5/local/chain/run_tdnn.sh +++ b/egs/mini_librispeech/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1f.sh \ No newline at end of file diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh new file mode 100755 index 00000000000..67b8b56ea49 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh @@ -0,0 +1,308 @@ +#!/bin/bash + +# 1f is as 1e but a smaller model with various tuning changes, the most +# important of which is the 'bottleneck-dim' option for the last layer; +# also dimensions are reduced and we've removed the 'target-rms=0.5' options +# on the prefinal layers. 
+# +# local/chain/compare_wer.sh exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp +# System tdnn1e_sp tdnn1f_sp +#WER dev_clean_2 (tgsmall) 13.84 13.92 +# [online:] 13.82 14.01 +#WER dev_clean_2 (tglarge) 10.17 9.83 +# [online:] 10.25 9.96 +# Final train prob -0.0500 -0.0515 +# Final valid prob -0.0870 -0.0889 +# Final train prob (xent) -1.4168 -1.3739 +# Final valid prob (xent) -1.6861 -1.6125 +# Num-params 7553634 3976418 + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f}_sp +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.6M dim=40+100->2353 combine=-0.055->-0.055 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.47,-1.42/-1.95,-1.73,-1.69) logprob:train/valid[10,16,final]=(-0.066,-0.054,-0.050/-0.100,-0.091,-0.087) +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.0M dim=40+100->2353 combine=-0.060->-0.059 (over 2) xent:train/valid[10,16,final]=(-1.64,-1.43,-1.37/-1.85,-1.66,-1.61) logprob:train/valid[10,16,final]=(-0.069,-0.057,-0.052/-0.104,-0.094,-0.089) + + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1f # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.05" + output_opts="l2-regularize=0.02 bottleneck-dim=192" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=384 + 
relu-batchnorm-layer name=tdnn2 $opts dim=384 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=384 + relu-batchnorm-layer name=tdnn4 $opts dim=384 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=384 + relu-batchnorm-layer name=tdnn6 $opts dim=384 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=384 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=384 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=384 + output-layer name=output include-log-softmax=false $output_opts dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn8 $opts dim=384 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=10 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). 
+ utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l Date: Sat, 6 Jan 2018 18:55:23 -0500 Subject: [PATCH 056/184] [egs] Improvement to mini-librispeech 1f example --- .../s5/local/chain/tuning/run_tdnn_1f.sh | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh index 67b8b56ea49..58852b61aa8 100755 --- a/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_tdnn_1f.sh @@ -1,25 +1,28 @@ #!/bin/bash + # 1f is as 1e but a smaller model with various tuning changes, the most # important of which is the 'bottleneck-dim' option for the last layer; # also dimensions are reduced and we've removed the 'target-rms=0.5' options # on the prefinal layers. 
# -# local/chain/compare_wer.sh exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp -# System tdnn1e_sp tdnn1f_sp -#WER dev_clean_2 (tgsmall) 13.84 13.92 -# [online:] 13.82 14.01 -#WER dev_clean_2 (tglarge) 10.17 9.83 -# [online:] 10.25 9.96 -# Final train prob -0.0500 -0.0515 -# Final valid prob -0.0870 -0.0889 -# Final train prob (xent) -1.4168 -1.3739 -# Final valid prob (xent) -1.6861 -1.6125 -# Num-params 7553634 3976418 - -# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f}_sp -# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.6M dim=40+100->2353 combine=-0.055->-0.055 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.47,-1.42/-1.95,-1.73,-1.69) logprob:train/valid[10,16,final]=(-0.066,-0.054,-0.050/-0.100,-0.091,-0.087) -# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.0M dim=40+100->2353 combine=-0.060->-0.059 (over 2) xent:train/valid[10,16,final]=(-1.64,-1.43,-1.37/-1.85,-1.66,-1.61) logprob:train/valid[10,16,final]=(-0.069,-0.057,-0.052/-0.104,-0.094,-0.089) +# local/chain/compare_wer.sh --online exp/chain/tdnn1{e,f}_sp 2>/dev/null +# local/chain/compare_wer.sh --online exp/chain/tdnn1e_sp exp/chain/tdnn1f_sp +# System tdnn1e_sp tdnn1f7_sp +#WER dev_clean_2 (tgsmall) 14.11 13.91 +# [online:] 14.07 13.96 +#WER dev_clean_2 (tglarge) 10.15 9.95 +# [online:] 10.16 10.13 +# Final train prob -0.0503 -0.0508 +# Final valid prob -0.0887 -0.0917 +# Final train prob (xent) -1.4257 -1.3509 +# Final valid prob (xent) -1.6799 -1.5883 +# Num-params 7508490 4205322 + + +# steps/info/chain_dir_info.pl exp/chain/tdnn1{e,f7}_sp +# exp/chain/tdnn1e_sp: num-iters=17 nj=2..5 num-params=7.5M dim=40+100->2309 combine=-0.057->-0.057 (over 1) xent:train/valid[10,16,final]=(-1.73,-1.46,-1.43/-1.94,-1.72,-1.68) logprob:train/valid[10,16,final]=(-0.067,-0.055,-0.050/-0.105,-0.095,-0.089) +# exp/chain/tdnn1f_sp: num-iters=17 nj=2..5 num-params=4.2M dim=40+100->2309 combine=-0.060->-0.060 (over 2) xent:train/valid[10,16,final]=(-1.60,-1.39,-1.35/-1.81,-1.64,-1.59) 
logprob:train/valid[10,16,final]=(-0.068,-0.056,-0.051/-0.104,-0.097,-0.092) # Set -e here so that we catch if any executable fails immediately @@ -175,7 +178,7 @@ if [ $stage -le 13 ]; then relu-batchnorm-layer name=tdnn5 $opts dim=384 relu-batchnorm-layer name=tdnn6 $opts dim=384 input=Append(-3,0,3) relu-batchnorm-layer name=tdnn7 $opts dim=384 input=Append(-3,0,3) - relu-batchnorm-layer name=tdnn8 $opts dim=384 input=Append(-6,-3,0) + relu-batchnorm-layer name=tdnn8 $opts dim=512 input=Append(-6,-3,0) ## adding the layers for chain branch relu-batchnorm-layer name=prefinal-chain $opts dim=384 From 3fecceae3655e0f23a5c0a0b5d86e4b42057bbd9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 8 Jan 2018 16:32:20 -0500 Subject: [PATCH 057/184] [doc] Documentation fix (thx: Denis Peskov) --- src/doc/lattices.dox | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/doc/lattices.dox b/src/doc/lattices.dox index 714d9de6f2e..0b222ec5f1a 100644 --- a/src/doc/lattices.dox +++ b/src/doc/lattices.dox @@ -264,8 +264,10 @@ has the same effect as calling that the normal OpenFst RemoveEps() and Determini \section lattices_generation Lattice generation -Currently, the only decoder that generates lattices is the class -LatticeSimpleDecoder, defined in decoder/lattice-simple-decoder.h, and invoked by gmm-latgen-simple.cc. +Command-line decoding programs that have 'latgen' in their names generate lattices. +Currently most of these use LatticeFasterDecoder. For purposes of exposition we will +focus instead on LatticeSimpleDecoder, whose operation is simpler. +This is defined in decoder/lattice-simple-decoder.h, and invoked by gmm-latgen-simple.cc. As the name suggests, LatticeSimpleDecoder is a lattice-generating decoder that is modified from SimpleDecoder. SimpleDecoder is a straightforwardly implemented Viterbi beam search algorithm with only a single tunable parameter: the pruning beam (see \ref decoders_simple). 
LatticeSimpleDecoder has From 6e6e6709217ebd9da297eb5dbc8366bae3571459 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 8 Jan 2018 20:43:28 -0500 Subject: [PATCH 058/184] [src] Fix to nnet-utils RE orthonormal-constraint --- egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh | 340 +++++++++++++++++++ src/nnet3/nnet-utils.cc | 10 +- 2 files changed, 347 insertions(+), 3 deletions(-) create mode 100755 egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh new file mode 100755 index 00000000000..2660adb85d7 --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -0,0 +1,340 @@ +#!/bin/bash + +# 1f is as 1e but a re-tuned model with fewer parameters and a bottleneck at the +# end. + +# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1f_sp +# System tdnn1e10_sp tdnn1f_sp +#WER dev93 (tgpr) 7.29 7.20 +#WER dev93 (tg) 7.08 7.00 +#WER dev93 (big-dict,tgpr) 5.15 5.08 +#WER dev93 (big-dict,fg) 4.52 4.65 +#WER eval92 (tgpr) 5.12 4.93 +#WER eval92 (tg) 4.91 4.66 +#WER eval92 (big-dict,tgpr) 2.94 2.87 +#WER eval92 (big-dict,fg) 2.57 2.39 +# Final train prob -0.0545 -0.0512 +# Final valid prob -0.0650 -0.0641 +# Final train prob (xent) -0.9696 -0.9105 +# Final valid prob (xent) -0.9917 -0.9523 +# Num-params 8067660 6071244 + +# exp/chain/tdnn1e_sp: num-iters=72 nj=2..8 num-params=8.1M dim=40+100->2854 combine=-0.064->-0.063 (over 3) xent:train/valid[47,71,final]=(-1.07,-0.973,-0.970/-1.08,-0.992,-0.992) logprob:train/valid[47,71,final]=(-0.064,-0.056,-0.054/-0.072,-0.066,-0.065) +# exp/chain/tdnn1f_sp: num-iters=72 nj=2..8 num-params=6.1M dim=40+100->2854 combine=-0.061->-0.061 (over 2) xent:train/valid[47,71,final]=(-1.04,-0.911,-0.910/-1.06,-0.953,-0.952) logprob:train/valid[47,71,final]=(-0.063,-0.052,-0.051/-0.071,-0.064,-0.064) + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used 
in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1f #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005 bottleneck-dim=320" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=448 + relu-batchnorm-layer name=tdnn2 $opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $opts dim=448 + relu-batchnorm-layer name=tdnn4 $opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn5 $opts dim=448 + relu-batchnorm-layer name=tdnn6 $opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn7 $opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn8 $opts dim=448 input=Append(-6,-3,0) + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain $opts dim=448 + output-layer name=output $output_opts include-log-softmax=false dim=$num_targets + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... 
this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent $opts input=tdnn8 dim=448 + output-layer name=output-xent $output_opts dim=$num_targets learning-rate-factor=$learning_rate_factor +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/wsj-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=8 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=256,128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=0 \ + --egs.chunk-right-context=0 \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + 
--egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l *M) { // Larger alpha will update faster but will be more prone to instability. I // believe the scalar value below shouldn't be more than 0.25 or maybe 0.5 or // it will always be unstable. It should be > 0.0. - // The factor of 1/scale^4 is, I *believe*, going to give us the right - // kind of invariance w.r.t. the scale. - BaseFloat alpha = 0.125 / (scale * scale * scale * scale); + // The factor of 1/scale^2 is, I *believe*, going to give us the right + // kind of invariance w.r.t. the scale. 
With regard to this factor, look at + // the statement + // M_update.AddMatMat(-4.0 * alpha, P, kNoTrans, *M, kNoTrans, 0.0); where P + // is proportional to scale^2 and M to 'scale', so the RHS is proportional to + // 'scale^3', but we'd like 'M_update' to be proportional to 'scale'. + BaseFloat alpha = 0.125 / (scale * scale); // We're enforcing the rows to be orthonormal. // define P = M M^T. If P is unit then M has orthonormal rows. From 1b9b9e7b6999a57fbf803e03fed3152ed81823f6 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 12 Jan 2018 00:48:49 -0500 Subject: [PATCH 059/184] [scripts] Add orthonormal-constraint options to layers. --- egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py | 10 +++++++--- egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py | 8 +++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 0e27e6369e4..c2962de96a7 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -471,6 +471,8 @@ def set_default_configs(self): self.config = {'input': '[-1]', 'dim': -1, 'bottleneck-dim': -1, + 'orthonormal-constraint': 1.0, + # orthonormal-constraint only matters if bottleneck-dim is set. 'include-log-softmax': True, # this would be false for chain models 'objective-type': 'linear', @@ -582,9 +584,11 @@ def _generate_config(self): # note: by default the LinearComponent uses natural gradient. 
line = ('component name={0}.linear type=LinearComponent ' - 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} ' - 'max-change=0.75 {3}' - ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + 'orthonormal-constraint={1} param-stddev={2} ' + 'input-dim={3} output-dim={4} max-change=0.75 {5}' + ''.format(self.name, self.config['orthonormal-constraint'], + self.config['orthonormal-constraint'] / math.sqrt(input_dim), + input_dim, bottleneck_dim, linear_options)) configs.append(line) line = ('component-node name={0}.linear component={0}.linear input={1}' ''.format(self.name, cur_node)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py index 67537f574e4..131acc254dd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/lstm.py @@ -829,6 +829,7 @@ def set_default_configs(self): 'clipping-threshold': 30.0, 'zeroing-interval': 20, 'zeroing-threshold': 15.0, + 'orthonormal-constraint': 1.0, 'delay' : -1, 'lstm-nonlinearity-options' : ' max-change=0.75', # the recurrence scale is the scale on m_trunc, used in the @@ -921,9 +922,10 @@ def _generate_lstm_config(self): # constraint, it's meaningless. 
configs.append("### Begin LTSM layer '{0}'".format(name)) configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "orthonormal-constraint=1.0 output-dim={2} {3}".format( - name, input_dim + cell_dim, bottleneck_dim, - affine_str)) + "orthonormal-constraint={2} output-dim={3} {4}".format( + name, input_dim + cell_dim, + self.config['orthonormal-constraint'], + bottleneck_dim, affine_str)) configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, From 7963c45c72f63c209aaab455e5b8074a64e55c12 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 12 Jan 2018 22:07:12 -0500 Subject: [PATCH 060/184] [src] Bug-fix in nnet3 compilation, RE Scale() expressions --- src/nnet3/nnet-compile.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-compile.cc b/src/nnet3/nnet-compile.cc index 93f35dc8615..9a4559803ad 100644 --- a/src/nnet3/nnet-compile.cc +++ b/src/nnet3/nnet-compile.cc @@ -608,6 +608,7 @@ BaseFloat Compiler::SplitByScale( } } + int32 num_rows = input_locations_list.size(); split_locations_lists->resize(alpha_to_nodes.size()); // `step_to_index` will map from the step-index to the index into // `split_locations_lists`; each index is associated with a different value of @@ -622,6 +623,7 @@ BaseFloat Compiler::SplitByScale( BaseFloat alpha = iter->first; const std::vector &nodes = iter->second; (*split_locations_lists)[split_locations_index].first = alpha; + (*split_locations_lists)[split_locations_index].second.resize(num_rows); for (size_t i = 0; i < nodes.size(); i++) { int32 node_index = nodes[i]; KALDI_ASSERT(node_to_steps.count(node_index) != 0); @@ -638,7 +640,6 @@ BaseFloat Compiler::SplitByScale( { // This block populates 'split_locations_lists[*].second' with the // split-by-alpha version of 'input_locations_list' - int32 num_rows = input_locations_list.size(); for (int32 r = 0; r < num_rows; r++) { const 
std::vector > &this_list = input_locations_list[r]; @@ -856,7 +857,7 @@ void Compiler::CompileBackwardSumDescriptor( BaseFloat this_alpha = split_locations_lists[i].first; KALDI_ASSERT(this_alpha - this_alpha == 0.0); std::vector > > submat_locations_list; - ComputeValueSubmatLocationsList(split_locations_lists[i].second, + ComputeDerivSubmatLocationsList(split_locations_lists[i].second, &submat_locations_list); CompileBackwardFromSubmatLocationsList(deriv_submatrix_index, this_alpha, From cbaf7e6c5054d0a1f53d940ee84c684737fa8fe8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 13 Jan 2018 01:11:42 -0500 Subject: [PATCH 061/184] [scripts] Add factorized layer --- .../libs/nnet3/xconfig/factorized_layer.py | 198 ++++++++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py | 1 + egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 + 3 files changed, 200 insertions(+) create mode 100644 egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py new file mode 100644 index 00000000000..16ba460a04e --- /dev/null +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py @@ -0,0 +1,198 @@ +# Copyright 2017-2018 Johns Hopkins University (Dan Povey) +# 2016 Vijayaditya Peddinti +# 2017 Google Inc. (vpeddinti@google.com) +# 2017 Vimal Manohar +# Apache 2.0. + +""" This module contains layers that just map to a single component. +""" + +from __future__ import print_function +import math +import re +import sys +from libs.nnet3.xconfig.basic_layers import XconfigLayerBase + + +class XconfigFactorizedLayer(XconfigLayerBase): + """This class is for parsing lines like + 'factorized-layer name=tdnn1 dim=1024 bottleneck-dim=256 bypass-scale=1.0 splicing=-3,0,3' + + This is basically the same as a relu-batchnorm-layer with the bottleneck-dim + set, except that it supports the 'bypass-scale' option, which makes the + whole thing a bit like a res-block. 
You specify the splicing via the 'splicing' + option instead of via 'input=xxx', as it needs to use the non-spliced inupt for + the bypass. + + Note: the 'dim' is actually optional; it will default to the + dimension of the input, and it must be the same as the dimension of the input. + + + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + splicing='0' [In general can be a comma-separated string describing + the TDNN time-offsets, like '-1,0,1' or '-3,0,3'. + Not specified via 'input', because we need the un-spliced + input so that we can do the] bypass. + dim=-1 [Output dimension of layer, e.g. 1024; must be set.] + bottleneck-dim=-1 [Bottleneck dimension, must be set; e.g. 256] + self-repair-scale=1.0e-05 [Affects the relu layer] + learning-rate-factor=1.0 [This can be used to make the affine component + train faster or slower]. + l2-regularize=0.0 [Set this to a nonzero value (e.g. 1.0e-05) to + add l2 regularization on the parameter norm for + this component. + + """ + def __init__(self, first_token, key_to_value, prev_names=None): + assert first_token == "factorized-layer" + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + + # note: self.config['input'] is a descriptor, '[-1]' means output + # the most recent layer. + self.config = {'input': '[-1]', + 'dim': -1, + 'bottleneck-dim': -1, + 'self-repair-scale': 1.0e-05, + 'target-rms': 1.0, + 'splicing': '0', + 'bypass-scale': 1.0, + 'ng-affine-options': '', + 'ng-linear-options': '', + # The following are passed through to components. + 'bias-stddev': '', + 'l2-regularize': '', + 'learning-rate-factor': '', + 'max-change': 0.75 } + + def check_configs(self): + input_dim = self.descriptors['input']['dim'] + + if self.config['dim'] == -1: + self.config['dim'] = input_dim + elif self.config['dim'] != input_dim: + raise RuntimeError("Dimension mismatch: dim={0} vs. 
input-dim={1}".format( + self.config['dim'], input_dim)) + b = self.config['bottleneck-dim'] + if b <= 0 or b >= self.config['dim']: + raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) + + if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: + raise RuntimeError("self-repair-scale has invalid value {0}" + .format(self.config['self-repair-scale'])) + if self.config['target-rms'] < 0.0: + raise RuntimeError("target-rms has invalid value {0}" + .format(self.config['target-rms'])) + if self.config['learning-rate-factor'] <= 0.0: + raise RuntimeError("learning-rate-factor has invalid value {0}" + .format(self.config['learning-rate-factor'])) + + splicing = self.config['splicing'] + try: + splicing_array = [ int(x) for x in splicing.split(',') ] + if not 0 in splicing_array: + raise RuntimeError("0 should probably be in the splicing indexes.") + except: + raise RuntimeError("Invalid option splicing={0}".format(splicing)) + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + # return something like: tdnn3.batchnorm + return '{0}.batchnorm'.format(self.name) + + def output_dim(self, auxiliary_output=None): + output_dim = self.config['dim'] + # If not set, the output-dim defaults to the input-dim. + if output_dim <= 0: + self.config['dim'] = self.descriptors['input']['dim'] + return output_dim + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. 
+ ans.append((config_name, line)) + return ans + + def _generate_config(self): + input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + bottleneck_dim = self.config['bottleneck-dim'] + output_dim = input_dim + self_repair_scale = self.config['self-repair-scale'] + target_rms = self.config['target-rms'] + bypass_scale = self.config['bypass-scale'] + splicing_array = [ int(x) for x in self.config['splicing'].split(',') ] + spliced_input_desc = 'Append({0})'.format( + ', '.join([ 'Offset({0}, {1})'.format(input_desc, offset) + for offset in splicing_array ])) + # e.g. spliced_input_desc = + # 'Append(Offset(tdnn2, -1), Offset(tdnn2, 0), Offset(tdnn2, 1))' + + spliced_input_dim = input_dim * len(splicing_array) + + affine_options = self.config['ng-affine-options'] + for opt_name in [ 'max-change', 'learning-rate-factor', + 'bias-stddev', 'l2-regularize' ]: + value = self.config[opt_name] + if value != '': + affine_options += ' {0}={1}'.format(opt_name, value) + + linear_options = self.config['ng-linear-options'] + for opt_name in [ 'max-change', 'learning-rate-factor' ]: + value = self.config[opt_name] + if value != '': + linear_options += ' {0}={1}'.format(opt_name, value) + + configs = [] + + # First the linear component that goes to the bottleneck dim. + # note: by default the LinearComponent uses natural gradient. 
+ line = ('component name={0}.linear type=LinearComponent ' + 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} {3}' + ''.format(self.name, spliced_input_dim, bottleneck_dim, + linear_options)) + configs.append(line) + line = ('component-node name={0}.linear component={0}.linear input={1}' + ''.format(self.name, spliced_input_desc)) + configs.append(line) + + # Now the affine component + line = ('component name={0}.affine type=NaturalGradientAffineComponent' + ' input-dim={1} output-dim={2} {3}' + ''.format(self.name, bottleneck_dim, output_dim, affine_options)) + configs.append(line) + line = ('component-node name={0}.affine component={0}.affine input={0}.linear' + ''.format(self.name)) + configs.append(line) + + # now the ReLU. Its input is the output of the affine component plus + # the non-sliced input (this is a bit like a res-block). + line = ('component name={0}.relu type=RectifiedLinearComponent dim={1}' + ' self-repair-scale={2}' + ''.format(self.name, output_dim, self_repair_scale)) + configs.append(line) + if bypass_scale != 0.0: + line = ('component-node name={0}.relu component={0}.relu ' + 'input=Sum(Scale({1}, {2}), {0}.affine) ' + ''.format(self.name, bypass_scale, input_desc)) + else: + line = ('component-node name={0}.relu component={0}.relu ' + 'input={0}.affine'.format(self.name)) + configs.append(line) + + line = ('component name={0}.batchnorm type=BatchNormComponent ' + 'dim={1} target-rms={2}' + ''.format(self.name, output_dim, target_rms)) + configs.append(line) + line = ('component-node name={0}.batchnorm component={0}.batchnorm ' + 'input={0}.relu'.format(self.name)) + configs.append(line) + + return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py index 593fbbb745c..d609fb2e685 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -10,3 +10,4 @@ from gru import * from stats_layer import * from trivial_layers 
import * +from factorized_layer import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index c6b0619bca8..ec9137eadd6 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,6 +68,7 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, + 'factorized-layer': xlayers.XconfigFactorizedLayer, 'renorm-component': xlayers.XconfigRenormComponent, 'no-op-component': xlayers.XconfigNoOpComponent } From b4f0585800102fe6a0c3b8a767b004694b669417 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 14 Jan 2018 14:43:31 -0500 Subject: [PATCH 062/184] [src] Small bug-fix affecting info output of LinearComponent --- src/nnet3/nnet-simple-component.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index ea5a2489bc4..b4138cc989e 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3050,7 +3050,7 @@ std::string LinearComponent::Info() const { stream << ", use-natural-gradient=" << (use_natural_gradient_ ? 
"true" : "false") << ", rank-in=" << preconditioner_in_.GetRank() - << ", rank-out=" << preconditioner_in_.GetRank() + << ", rank-out=" << preconditioner_out_.GetRank() << ", num-samples-history=" << preconditioner_in_.GetNumSamplesHistory() << ", update-period=" << preconditioner_in_.GetUpdatePeriod() From 096a42b59f8c89b4eb444c280b937fb0b8814f3e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Jan 2018 14:28:28 -0500 Subject: [PATCH 063/184] [scripts] Add options to factorized-layer --- .../libs/nnet3/xconfig/factorized_layer.py | 31 +++++++++++++++++-- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py index 16ba460a04e..e19cf014ab0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py @@ -57,10 +57,14 @@ def set_default_configs(self): 'bottleneck-dim': -1, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, + 'extra-relu': False, 'splicing': '0', 'bypass-scale': 1.0, 'ng-affine-options': '', 'ng-linear-options': '', + # if second-matrix-orthonormal, the 2nd matrix + # has the orthonormal constraint. + 'second-matrix-orthonormal': False, # The following are passed through to components. 'bias-stddev': '', 'l2-regularize': '', @@ -132,6 +136,8 @@ def _generate_config(self): spliced_input_desc = 'Append({0})'.format( ', '.join([ 'Offset({0}, {1})'.format(input_desc, offset) for offset in splicing_array ])) + extra_relu = self.config['extra-relu'] + # e.g. spliced_input_desc = # 'Append(Offset(tdnn2, -1), Offset(tdnn2, 0), Offset(tdnn2, 1))' @@ -150,12 +156,20 @@ def _generate_config(self): if value != '': linear_options += ' {0}={1}'.format(opt_name, value) + if self.config['second-matrix-orthonormal']: + # we have to mess with the range of the parameters so they are within + # the circle of convergence... 
+ affine_options += ' orthonormal-constraint=1.0 param-stddev={0}'.format( + math.sqrt(1.0 / output_dim)) + else: + linear_options += ' orthonormal-constraint=1.0' + configs = [] # First the linear component that goes to the bottleneck dim. # note: by default the LinearComponent uses natural gradient. line = ('component name={0}.linear type=LinearComponent ' - 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} {3}' + 'input-dim={1} output-dim={2} {3}' ''.format(self.name, spliced_input_dim, bottleneck_dim, linear_options)) configs.append(line) @@ -163,13 +177,24 @@ def _generate_config(self): ''.format(self.name, spliced_input_desc)) configs.append(line) + if extra_relu: + # add a relu between the linear and the affine. + line = ('component name={0}.relu0 type=RectifiedLinearComponent dim={1}' + ' self-repair-scale={2}' + ''.format(self.name, bottleneck_dim, self_repair_scale)) + configs.append(line) + line = ('component-node name={0}.relu0 component={0}.relu0 ' + 'input={0}.linear'.format(self.name)) + configs.append(line) + + # Now the affine component line = ('component name={0}.affine type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' ''.format(self.name, bottleneck_dim, output_dim, affine_options)) configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={0}.linear' - ''.format(self.name)) + line = ('component-node name={0}.affine component={0}.affine input={0}.{1}' + ''.format(self.name, ('relu0' if extra_relu else 'linear'))) configs.append(line) # now the ReLU. 
Its input is the output of the affine component plus From 168a642f2b7dff7e2fef0f2ad348288c829738f0 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Jan 2018 14:29:43 -0500 Subject: [PATCH 064/184] [src] Allow to apply orthonormal constraint on affine component --- .../steps/libs/nnet3/xconfig/basic_layers.py | 22 +++++++- src/nnet3/nnet-simple-component.cc | 33 +++++++++++- src/nnet3/nnet-simple-component.h | 18 ++++++- src/nnet3/nnet-utils.cc | 54 ++++++++++++------- src/nnet3/nnet-utils.h | 3 +- 5 files changed, 106 insertions(+), 24 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index c2962de96a7..a6de5d163c0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -655,7 +655,15 @@ class XconfigBasicLayer(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' [Descriptor giving the input of the layer.] - dim=None [Output dimension of layer, e.g. 1024] + dim=-1 [Output dimension of layer, e.g. 1024] + bottleneck-dim=-1 [If you set this, a linear bottleneck is added, so + we project to first bottleneck-dim then to dim. One + of the two matrices is constrained to be orthonormal; + see 'second-matrix-orthonormal'.] + second-matrix-orthonormal=False [Only makes a difference if bottleneck-dim>0. + You can set this to true if you want the orthormal-rows + constraint to be applied to the 2nd, not the first, of + the two marices.] self-repair-scale=1.0e-05 [Affects relu, sigmoid and tanh layers.] learning-rate-factor=1.0 [This can be used to make the affine component train faster or slower]. 
@@ -676,6 +684,7 @@ def set_default_configs(self): self.config = {'input': '[-1]', 'dim': -1, 'bottleneck-dim': -1, + 'second-matrix-orthonormal': False, 'self-repair-scale': 1.0e-05, 'target-rms': 1.0, 'ng-affine-options': '', @@ -790,10 +799,12 @@ def _add_components(self, input_desc, input_dim, nonlinearities): value = self.config[opt_name] if value != '': linear_options += ' {0}={1}'.format(opt_name, value) + if not self.config['second-matrix-orthonormal']: + linear_options += ' orthonormal-constraint=1.0' bottleneck_dim = self.config['bottleneck-dim'] # note: by default the LinearComponent uses natural gradient. line = ('component name={0}.linear type=LinearComponent ' - 'orthonormal-constraint=1.0 input-dim={1} output-dim={2} {3}' + 'input-dim={1} output-dim={2} {3}' ''.format(self.name, input_dim, bottleneck_dim, linear_options)) configs.append(line) line = ('component-node name={0}.linear component={0}.linear input={1}' @@ -803,6 +814,13 @@ def _add_components(self, input_desc, input_dim, nonlinearities): cur_dim = bottleneck_dim + if self.config['second-matrix-orthonormal']: + assert self.config['bottleneck-dim'] > 0 + # we have to mess with the range of the parameters so they are within + # the circle of convergence... 
+ affine_options += ' orthonormal-constraint=1.0 param-stddev={0}'.format( + math.sqrt(1.0 / output_dim)) + line = ('component name={0}.affine type=NaturalGradientAffineComponent' ' input-dim={1} output-dim={2} {3}' ''.format(self.name, cur_dim, output_dim, affine_options)) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index b4138cc989e..01adb222372 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1031,13 +1031,15 @@ void AffineComponent::Add(BaseFloat alpha, const Component &other_in) { AffineComponent::AffineComponent(const AffineComponent &component): UpdatableComponent(component), linear_params_(component.linear_params_), - bias_params_(component.bias_params_) { } + bias_params_(component.bias_params_), + orthonormal_constraint_(component.orthonormal_constraint_) { } AffineComponent::AffineComponent(const CuMatrixBase &linear_params, const CuVectorBase &bias_params, BaseFloat learning_rate): linear_params_(linear_params), - bias_params_(bias_params) { + bias_params_(bias_params), + orthonormal_constraint_(0.0) { SetUnderlyingLearningRate(learning_rate); KALDI_ASSERT(linear_params.NumRows() == bias_params.Dim()&& bias_params.Dim() != 0); @@ -1063,6 +1065,8 @@ void AffineComponent::PerturbParams(BaseFloat stddev) { std::string AffineComponent::Info() const { std::ostringstream stream; stream << UpdatableComponent::Info(); + if (orthonormal_constraint_ != 0.0) + stream << ", orthonormal-constraint=" << orthonormal_constraint_; PrintParameterStats(stream, "linear-params", linear_params_, false, // include_mean true, // include_row_norms @@ -1129,6 +1133,8 @@ void AffineComponent::InitFromConfig(ConfigLine *cfl) { Init(input_dim, output_dim, param_stddev, bias_stddev); } + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + if (cfl->HasUnusedValues()) KALDI_ERR << "Could not process these elements in initializer: " << cfl->UnusedValues(); @@ -1197,6 +1203,12 @@ void 
AffineComponent::Read(std::istream &is, bool binary) { ExpectToken(is, binary, ""); ReadBasicType(is, binary, &is_gradient_); } + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); } @@ -1206,6 +1218,10 @@ void AffineComponent::Write(std::ostream &os, bool binary) const { linear_params_.Write(os, binary); WriteToken(os, binary, ""); bias_params_.Write(os, binary); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); } @@ -2664,6 +2680,12 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &rank_in); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &rank_out); + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &orthonormal_constraint_); + } else { + orthonormal_constraint_ = 0.0; + } ExpectToken(is, binary, ""); ReadBasicType(is, binary, &update_period); ExpectToken(is, binary, ""); @@ -2770,6 +2792,9 @@ void NaturalGradientAffineComponent::InitFromConfig(ConfigLine *cfl) { bias_params_.Add(bias_mean); } + orthonormal_constraint_ = 0.0; + cfl->GetValue("orthonormal-constraint", &orthonormal_constraint_); + // Set natural-gradient configs. 
BaseFloat num_samples_history = 2000.0, alpha = 4.0; @@ -2807,6 +2832,10 @@ void NaturalGradientAffineComponent::Write(std::ostream &os, WriteBasicType(os, binary, preconditioner_in_.GetRank()); WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_out_.GetRank()); + if (orthonormal_constraint_ != 0.0) { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, orthonormal_constraint_); + } WriteToken(os, binary, ""); WriteBasicType(os, binary, preconditioner_in_.GetUpdatePeriod()); WriteToken(os, binary, ""); diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index f596ec6be75..b1eb30a55bf 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -390,10 +390,11 @@ class AffineComponent: public UpdatableComponent { virtual int32 InputDim() const { return linear_params_.NumCols(); } virtual int32 OutputDim() const { return linear_params_.NumRows(); } + BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; } virtual std::string Info() const; virtual void InitFromConfig(ConfigLine *cfl); - AffineComponent() { } // use Init to really initialize. + AffineComponent(): orthonormal_constraint_(0.0) { } // use Init to really initialize. virtual std::string Type() const { return "AffineComponent"; } virtual int32 Properties() const { return kSimpleComponent|kUpdatableComponent| @@ -434,6 +435,7 @@ class AffineComponent: public UpdatableComponent { const CuMatrixBase &linear); const CuVector &BiasParams() const { return bias_params_; } const CuMatrix &LinearParams() const { return linear_params_; } + CuMatrix &LinearParams() { return linear_params_; } explicit AffineComponent(const AffineComponent &other); // The next constructor is used in converting from nnet1. AffineComponent(const CuMatrixBase &linear_params, @@ -466,6 +468,7 @@ class AffineComponent: public UpdatableComponent { const AffineComponent &operator = (const AffineComponent &other); // Disallow. 
CuMatrix linear_params_; CuVector bias_params_; + BaseFloat orthonormal_constraint_; }; class RepeatedAffineComponent; @@ -755,6 +758,19 @@ class LogSoftmaxComponent: public NonlinearComponent { Dimension is output-dim by (input-dim + 1), last column is interpreted as the bias. + Other options: + orthonormal-constraint=0.0 If you set this to 1.0, then + the linear_params_ matrix will be (approximately) + constrained during training to have orthonormal rows + (or columns, whichever is fewer). You can choose a + positive nonzero value different than 1.0 to have a + scaled orthonormal matrix, i.e. with singular values + at the selected value (e.g. 0.5, or 2.0). This is + not enforced inside the component itself; you have to + call ConstrainOrthonormal() from the training code to + do this. All this component does is return the + OrthonormalConstraint() value. + Options to the natural gradient (you won't normally have to set these, the defaults are suitable): diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index dff4cdbee74..cc5762474d6 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -907,30 +907,48 @@ void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase *M) { /** This function, to be called after processing every minibatch, is responsible for enforcing the orthogonality constraint for any components of type - LinearComponent that have the "orthonormal_constraint" value set. + LinearComponent or inheriting from AffineComponent that have the + "orthonormal_constraint" value set. */ void ConstrainOrthonormal(Nnet *nnet) { + for (int32 c = 0; c < nnet->NumComponents(); c++) { Component *component = nnet->GetComponent(c); LinearComponent *lc = dynamic_cast(component); - if (lc == NULL || lc->OrthonormalConstraint() == 0.0) - continue; - if (RandInt(0, 3) != 0) - continue; // For efficiency, only do this every 4 minibatches-- it won't - // stray far. 
- - - BaseFloat scale = lc->OrthonormalConstraint(); - KALDI_ASSERT(scale > 0.0); + if (lc != NULL && lc->OrthonormalConstraint() != 0.0) { + if (RandInt(0, 3) != 0) + continue; // For efficiency, only do this every 4 minibatches-- it won't + // stray far. + BaseFloat scale = lc->OrthonormalConstraint(); + KALDI_ASSERT(scale > 0.0); + + CuMatrixBase ¶ms = lc->Params(); + int32 rows = params.NumRows(), cols = params.NumCols(); + if (rows <= cols) { + ConstrainOrthonormalInternal(scale, ¶ms); + } else { + CuMatrix params_trans(params, kTrans); + ConstrainOrthonormalInternal(scale, ¶ms_trans); + params.CopyFromMat(params_trans, kTrans); + } + } - CuMatrixBase ¶ms = lc->Params(); - int32 rows = params.NumRows(), cols = params.NumCols(); - if (rows <= cols) { - ConstrainOrthonormalInternal(scale, ¶ms); - } else { - CuMatrix params_trans(params, kTrans); - ConstrainOrthonormalInternal(scale, ¶ms_trans); - params.CopyFromMat(params_trans, kTrans); + AffineComponent *ac = dynamic_cast(component); + if (ac != NULL && ac->OrthonormalConstraint() != 0.0) { + if (RandInt(0, 3) != 0) + continue; // For efficiency, only do this every 4 minibatches-- it won't + // stray far. 
+ BaseFloat scale = ac->OrthonormalConstraint(); + KALDI_ASSERT(scale > 0.0); + CuMatrixBase ¶ms = ac->LinearParams(); + int32 rows = params.NumRows(), cols = params.NumCols(); + if (rows <= cols) { + ConstrainOrthonormalInternal(scale, ¶ms); + } else { + CuMatrix params_trans(params, kTrans); + ConstrainOrthonormalInternal(scale, ¶ms_trans); + params.CopyFromMat(params_trans, kTrans); + } } } } diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index fc1631a8d77..efa36e1f64c 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -453,7 +453,8 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, /** This function, to be called after processing every minibatch, is responsible for enforcing the orthogonality constraint for any components of type - LinearComponent that have the "orthonormal-constraint" value set to nonzero. + LinearComponent or inheriting from AffineComponent that have the + "orthonormal-constraint" value set to nonzero. In order to make it efficient on GPU, it doesn't make it completely orthonormal, it just makes it closer to being orthonormal (times the 'orthonormal_constraint' From e3ea3c8e0928113a9486d2d97ced0bb1ed053631 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Jan 2018 14:33:11 -0500 Subject: [PATCH 065/184] [scripts] update chain_dir_info.pl to handle no chain l2-regularize --- egs/wsj/s5/steps/info/chain_dir_info.pl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/info/chain_dir_info.pl b/egs/wsj/s5/steps/info/chain_dir_info.pl index d0fac5292c6..cda271f9724 100755 --- a/egs/wsj/s5/steps/info/chain_dir_info.pl +++ b/egs/wsj/s5/steps/info/chain_dir_info.pl @@ -139,7 +139,7 @@ sub get_combine_info { return sprintf(" combine=%.3f->%.3f", $1, $2); } elsif (m/Combining (\S+) nnets, objective function changed from (\S+) to (\S+)/) { close(F); - return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, $1); + return sprintf(" combine=%.3f->%.3f (over %d)", $2, $3, 
$1); } } } @@ -204,6 +204,9 @@ sub get_logprob_and_accuracy_info { if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) { $iter_to_train_logprob{$iter} = $1; $iter_to_train_penalty{$iter} = $2; + } elsif (m/Overall log-probability for 'output' is (\S+)/) { + $iter_to_train_logprob{$iter} = $1; + $iter_to_train_penalty{$iter} = 0.0; } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) { $iter_to_train_xent{$iter} = $1; } @@ -213,6 +216,9 @@ sub get_logprob_and_accuracy_info { if (m/Overall log-probability for 'output' is (\S+) \+ (\S+)/) { $iter_to_valid_logprob{$iter} = $1; $iter_to_valid_penalty{$iter} = $2; + } elsif (m/Overall log-probability for 'output' is (\S+)/) { + $iter_to_valid_logprob{$iter} = $1; + $iter_to_valid_penalty{$iter} = 0.0; } elsif (m/Overall log-probability for 'output-xent' is (\S+) per frame/) { $iter_to_valid_xent{$iter} = $1; } From d2a1485a4a621cc23e53a1587b495090d4b47abc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Jan 2018 14:36:05 -0500 Subject: [PATCH 066/184] [scripts] Remove factorized-layer --- .../libs/nnet3/xconfig/factorized_layer.py | 223 ------------------ egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py | 1 - egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 - 3 files changed, 225 deletions(-) delete mode 100644 egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py deleted file mode 100644 index e19cf014ab0..00000000000 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/factorized_layer.py +++ /dev/null @@ -1,223 +0,0 @@ -# Copyright 2017-2018 Johns Hopkins University (Dan Povey) -# 2016 Vijayaditya Peddinti -# 2017 Google Inc. (vpeddinti@google.com) -# 2017 Vimal Manohar -# Apache 2.0. - -""" This module contains layers that just map to a single component. 
-""" - -from __future__ import print_function -import math -import re -import sys -from libs.nnet3.xconfig.basic_layers import XconfigLayerBase - - -class XconfigFactorizedLayer(XconfigLayerBase): - """This class is for parsing lines like - 'factorized-layer name=tdnn1 dim=1024 bottleneck-dim=256 bypass-scale=1.0 splicing=-3,0,3' - - This is basically the same as a relu-batchnorm-layer with the bottleneck-dim - set, except that it supports the 'bypass-scale' option, which makes the - whole thing a bit like a res-block. You specify the splicing via the 'splicing' - option instead of via 'input=xxx', as it needs to use the non-spliced inupt for - the bypass. - - Note: the 'dim' is actually optional; it will default to the - dimension of the input, and it must be the same as the dimension of the input. - - - Parameters of the class, and their defaults: - input='[-1]' [Descriptor giving the input of the layer.] - splicing='0' [In general can be a comma-separated string describing - the TDNN time-offsets, like '-1,0,1' or '-3,0,3'. - Not specified via 'input', because we need the un-spliced - input so that we can do the] bypass. - dim=-1 [Output dimension of layer, e.g. 1024; must be set.] - bottleneck-dim=-1 [Bottleneck dimension, must be set; e.g. 256] - self-repair-scale=1.0e-05 [Affects the relu layer] - learning-rate-factor=1.0 [This can be used to make the affine component - train faster or slower]. - l2-regularize=0.0 [Set this to a nonzero value (e.g. 1.0e-05) to - add l2 regularization on the parameter norm for - this component. - - """ - def __init__(self, first_token, key_to_value, prev_names=None): - assert first_token == "factorized-layer" - XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) - - def set_default_configs(self): - - # note: self.config['input'] is a descriptor, '[-1]' means output - # the most recent layer. 
- self.config = {'input': '[-1]', - 'dim': -1, - 'bottleneck-dim': -1, - 'self-repair-scale': 1.0e-05, - 'target-rms': 1.0, - 'extra-relu': False, - 'splicing': '0', - 'bypass-scale': 1.0, - 'ng-affine-options': '', - 'ng-linear-options': '', - # if second-matrix-orthonormal, the 2nd matrix - # has the orthonormal constraint. - 'second-matrix-orthonormal': False, - # The following are passed through to components. - 'bias-stddev': '', - 'l2-regularize': '', - 'learning-rate-factor': '', - 'max-change': 0.75 } - - def check_configs(self): - input_dim = self.descriptors['input']['dim'] - - if self.config['dim'] == -1: - self.config['dim'] = input_dim - elif self.config['dim'] != input_dim: - raise RuntimeError("Dimension mismatch: dim={0} vs. input-dim={1}".format( - self.config['dim'], input_dim)) - b = self.config['bottleneck-dim'] - if b <= 0 or b >= self.config['dim']: - raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) - - if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: - raise RuntimeError("self-repair-scale has invalid value {0}" - .format(self.config['self-repair-scale'])) - if self.config['target-rms'] < 0.0: - raise RuntimeError("target-rms has invalid value {0}" - .format(self.config['target-rms'])) - if self.config['learning-rate-factor'] <= 0.0: - raise RuntimeError("learning-rate-factor has invalid value {0}" - .format(self.config['learning-rate-factor'])) - - splicing = self.config['splicing'] - try: - splicing_array = [ int(x) for x in splicing.split(',') ] - if not 0 in splicing_array: - raise RuntimeError("0 should probably be in the splicing indexes.") - except: - raise RuntimeError("Invalid option splicing={0}".format(splicing)) - - def output_name(self, auxiliary_output=None): - assert auxiliary_output is None - # return something like: tdnn3.batchnorm - return '{0}.batchnorm'.format(self.name) - - def output_dim(self, auxiliary_output=None): - output_dim = self.config['dim'] - # If not 
set, the output-dim defaults to the input-dim. - if output_dim <= 0: - self.config['dim'] = self.descriptors['input']['dim'] - return output_dim - - def get_full_config(self): - ans = [] - config_lines = self._generate_config() - - for line in config_lines: - for config_name in ['ref', 'final']: - # we do not support user specified matrices in this layer - # so 'ref' and 'final' configs are the same. - ans.append((config_name, line)) - return ans - - def _generate_config(self): - input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - bottleneck_dim = self.config['bottleneck-dim'] - output_dim = input_dim - self_repair_scale = self.config['self-repair-scale'] - target_rms = self.config['target-rms'] - bypass_scale = self.config['bypass-scale'] - splicing_array = [ int(x) for x in self.config['splicing'].split(',') ] - spliced_input_desc = 'Append({0})'.format( - ', '.join([ 'Offset({0}, {1})'.format(input_desc, offset) - for offset in splicing_array ])) - extra_relu = self.config['extra-relu'] - - # e.g. spliced_input_desc = - # 'Append(Offset(tdnn2, -1), Offset(tdnn2, 0), Offset(tdnn2, 1))' - - spliced_input_dim = input_dim * len(splicing_array) - - affine_options = self.config['ng-affine-options'] - for opt_name in [ 'max-change', 'learning-rate-factor', - 'bias-stddev', 'l2-regularize' ]: - value = self.config[opt_name] - if value != '': - affine_options += ' {0}={1}'.format(opt_name, value) - - linear_options = self.config['ng-linear-options'] - for opt_name in [ 'max-change', 'learning-rate-factor' ]: - value = self.config[opt_name] - if value != '': - linear_options += ' {0}={1}'.format(opt_name, value) - - if self.config['second-matrix-orthonormal']: - # we have to mess with the range of the parameters so they are within - # the circle of convergence... 
- affine_options += ' orthonormal-constraint=1.0 param-stddev={0}'.format( - math.sqrt(1.0 / output_dim)) - else: - linear_options += ' orthonormal-constraint=1.0' - - configs = [] - - # First the linear component that goes to the bottleneck dim. - # note: by default the LinearComponent uses natural gradient. - line = ('component name={0}.linear type=LinearComponent ' - 'input-dim={1} output-dim={2} {3}' - ''.format(self.name, spliced_input_dim, bottleneck_dim, - linear_options)) - configs.append(line) - line = ('component-node name={0}.linear component={0}.linear input={1}' - ''.format(self.name, spliced_input_desc)) - configs.append(line) - - if extra_relu: - # add a relu between the linear and the affine. - line = ('component name={0}.relu0 type=RectifiedLinearComponent dim={1}' - ' self-repair-scale={2}' - ''.format(self.name, bottleneck_dim, self_repair_scale)) - configs.append(line) - line = ('component-node name={0}.relu0 component={0}.relu0 ' - 'input={0}.linear'.format(self.name)) - configs.append(line) - - - # Now the affine component - line = ('component name={0}.affine type=NaturalGradientAffineComponent' - ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, bottleneck_dim, output_dim, affine_options)) - configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={0}.{1}' - ''.format(self.name, ('relu0' if extra_relu else 'linear'))) - configs.append(line) - - # now the ReLU. Its input is the output of the affine component plus - # the non-sliced input (this is a bit like a res-block). 
- line = ('component name={0}.relu type=RectifiedLinearComponent dim={1}' - ' self-repair-scale={2}' - ''.format(self.name, output_dim, self_repair_scale)) - configs.append(line) - if bypass_scale != 0.0: - line = ('component-node name={0}.relu component={0}.relu ' - 'input=Sum(Scale({1}, {2}), {0}.affine) ' - ''.format(self.name, bypass_scale, input_desc)) - else: - line = ('component-node name={0}.relu component={0}.relu ' - 'input={0}.affine'.format(self.name)) - configs.append(line) - - line = ('component name={0}.batchnorm type=BatchNormComponent ' - 'dim={1} target-rms={2}' - ''.format(self.name, output_dim, target_rms)) - configs.append(line) - line = ('component-node name={0}.batchnorm component={0}.batchnorm ' - 'input={0}.relu'.format(self.name)) - configs.append(line) - - return configs diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py index d609fb2e685..593fbbb745c 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/layers.py @@ -10,4 +10,3 @@ from gru import * from stats_layer import * from trivial_layers import * -from factorized_layer import * diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index ec9137eadd6..c6b0619bca8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -68,7 +68,6 @@ 'opgru-layer' : xlayers.XconfigOpgruLayer, 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, - 'factorized-layer': xlayers.XconfigFactorizedLayer, 'renorm-component': xlayers.XconfigRenormComponent, 'no-op-component': xlayers.XconfigNoOpComponent } From 99137449b70cc6cbc3f260b213750b977d95ee84 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 16 Jan 2018 13:29:12 -0500 Subject: [PATCH 067/184] [egs] update swbd compare_wer_general.sh to include rt03 --- 
.../s5c/local/chain/compare_wer_general.sh | 56 +++++++++++++++++-- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh index c4c3d182bfe..fcd66d5d78d 100755 --- a/egs/swbd/s5c/local/chain/compare_wer_general.sh +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -8,10 +8,18 @@ echo "# $0 $*"; # print command line. include_looped=false -if [ "$1" == "--looped" ]; then - include_looped=true - shift -fi +include_rt03=false + +for x in $(seq 3); do + if [ "$1" == "--looped" ]; then + include_looped=true + shift + fi + if [ "$1" == "--rt03" ]; then + include_rt03=true + shift + fi +done echo -n "# System " for x in $*; do printf " % 9s" $x; done @@ -120,6 +128,46 @@ if $include_looped; then fi +if $include_rt03; then + echo -n "# WER on rt03(tg) " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_rt03*sw1_tg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_rt03*sw1_tg${epoch_suffix}_looped/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi + + echo -n "# WER on rt03(fg) " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_rt03*sw1_fsh_fg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + + if $include_looped; then + echo -n "# [looped:] " + for x in $*; do + set_names $x + wer=$(grep Sum $dirname/decode_rt03*sw1_fsh_fg${epoch_suffix}_looped/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + printf "% 10s" $wer + done + echo + fi +fi + + + if $used_epochs; then # we don't print the probs in this case. 
exit 0 From 69359940ff26654b2c6fc7f94a22ed2b7f4af1a3 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 16 Jan 2018 19:04:17 -0500 Subject: [PATCH 068/184] [src,egs] Somer nnet3 fixes that shouldn't affect anything; adding some example scripts. --- .../s5c/local/chain/tuning/run_tdnn_7m.sh | 5 +- .../s5c/local/chain/tuning/run_tdnn_7m19c.sh | 383 ++++++++++ .../local/chain/tuning/run_tdnn_lstm_1m.sh | 7 +- egs/wsj/s5/local/chain/run_tdnn.sh | 2 +- egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh | 32 +- .../local/chain/tuning/run_tdnn_lstm_1b22c.sh | 680 ++++++++++++++++++ .../internal/resolve_ctm_edits_overlaps.py | 2 +- src/nnet3/nnet-analyze.cc | 20 +- src/nnet3/nnet-analyze.h | 9 +- src/nnet3/nnet-optimize-utils.cc | 13 +- src/nnet3/nnet-optimize.cc | 4 +- 11 files changed, 1125 insertions(+), 32 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh create mode 100755 egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b22c.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh index 552e944c05a..03b1ee3c97f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m.sh @@ -31,6 +31,7 @@ speed_perturb=true dir=exp/chain/tdnn_7m # Note: _sp will get added to this if $speed_perturb == true. decode_iter= decode_nj=50 +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi # training options num_epochs=4 @@ -214,7 +215,7 @@ if [ ! 
-z $decode_iter ]; then fi if [ $stage -le 15 ]; then rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ @@ -243,7 +244,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh new file mode 100755 index 00000000000..8cc029744a1 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh @@ -0,0 +1,383 @@ +#!/bin/bash +# Note: before merging to master, this will be renamed. + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m19c +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn2 $opts input=Append(-1,0,1) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn3 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn4 $opts input=Append(-1,0,1) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn5 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn6 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn7 $opts input=Append(-3,0,3,tdnn5) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn8 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn9 $opts input=Append(-3,0,3,tdnn7) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn10 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn11 $opts input=Append(-3,0,3,tdnn9) dim=1280 bottleneck-dim=256 + + relu-batchnorm-layer 
name=prefinal-chain input=tdnn11 $opts dim=1280 bottleneck-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11 $opts dim=1280 bottleneck-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' 
from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh index 1d566290163..4b2c93082d9 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -6,7 +6,7 @@ # After comparing different combinations of dropout(with or without) and decay-time # option(20, 40 or without), we found this setup is best. -#System tdnn_lstm_1l_ld5 tdnn_lstm_1m_ld 1m_online +#System tdnn_lstm_1l_ld5 tdnn_lstm_1m_ld 1m_online #WER on train_dev(tg) 12.41 12.37 12.21 #WER on train_dev(fg) 11.59 11.46 11.41 #WER on eval2000(tg) 14.8 14.8 14.9 @@ -30,6 +30,7 @@ dir=exp/chain/tdnn_lstm_1m # Note: _sp will get added to this if $speed_perturb decode_iter= decode_dir_affix= decode_nj=50 +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi # training options leftmost_questions_truncate=-1 @@ -227,7 +228,7 @@ if [ $stage -le 15 ]; then if [ ! 
-z $decode_iter ]; then iter_opts=" --iter $decode_iter " fi - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj 50 --cmd "$decode_cmd" $iter_opts \ @@ -257,7 +258,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000; do + for decode_set in train_dev eval2000 rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. diff --git a/egs/wsj/s5/local/chain/run_tdnn.sh b/egs/wsj/s5/local/chain/run_tdnn.sh index 75da1a0a553..cb5756188a4 120000 --- a/egs/wsj/s5/local/chain/run_tdnn.sh +++ b/egs/wsj/s5/local/chain/run_tdnn.sh @@ -1 +1 @@ -tuning/run_tdnn_1e.sh \ No newline at end of file +tuning/run_tdnn_1f.sh \ No newline at end of file diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh index 2660adb85d7..be8d39de80b 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1f.sh @@ -1,24 +1,26 @@ #!/bin/bash # 1f is as 1e but a re-tuned model with fewer parameters and a bottleneck at the -# end. +# end, and no chain l2-regularize +#[note: was 1e12e.] 
-# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1f_sp -# System tdnn1e10_sp tdnn1f_sp +# local/chain/compare_wer.sh exp/chain/tdnn1e10_sp exp/chain/tdnn1e12e_sp +# System tdnn1e10_sp tdnn1e12e_sp #WER dev93 (tgpr) 7.29 7.20 -#WER dev93 (tg) 7.08 7.00 -#WER dev93 (big-dict,tgpr) 5.15 5.08 -#WER dev93 (big-dict,fg) 4.52 4.65 -#WER eval92 (tgpr) 5.12 4.93 -#WER eval92 (tg) 4.91 4.66 -#WER eval92 (big-dict,tgpr) 2.94 2.87 -#WER eval92 (big-dict,fg) 2.57 2.39 -# Final train prob -0.0545 -0.0512 -# Final valid prob -0.0650 -0.0641 -# Final train prob (xent) -0.9696 -0.9105 -# Final valid prob (xent) -0.9917 -0.9523 +#WER dev93 (tg) 7.08 6.81 +#WER dev93 (big-dict,tgpr) 5.15 5.04 +#WER dev93 (big-dict,fg) 4.52 4.42 +#WER eval92 (tgpr) 5.12 4.80 +#WER eval92 (tg) 4.91 4.54 +#WER eval92 (big-dict,tgpr) 2.94 2.76 +#WER eval92 (big-dict,fg) 2.57 2.30 +# Final train prob -0.0545 -0.0455 +# Final valid prob -0.0650 -0.0599 +# Final train prob (xent) -0.9696 -0.9060 +# Final valid prob (xent) -0.9917 -0.9448 # Num-params 8067660 6071244 + # exp/chain/tdnn1e_sp: num-iters=72 nj=2..8 num-params=8.1M dim=40+100->2854 combine=-0.064->-0.063 (over 3) xent:train/valid[47,71,final]=(-1.07,-0.973,-0.970/-1.08,-0.992,-0.992) logprob:train/valid[47,71,final]=(-0.064,-0.056,-0.054/-0.072,-0.066,-0.065) # exp/chain/tdnn1f_sp: num-iters=72 nj=2..8 num-params=6.1M dim=40+100->2854 combine=-0.061->-0.061 (over 2) xent:train/valid[47,71,final]=(-1.04,-0.911,-0.910/-1.06,-0.953,-0.952) logprob:train/valid[47,71,final]=(-0.063,-0.052,-0.051/-0.071,-0.064,-0.064) @@ -216,7 +218,7 @@ if [ $stage -le 16 ]; then --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.00005 \ + --chain.l2-regularize=0.0 \ --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=2000" \ --trainer.srand=$srand \ diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b22c.sh 
b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b22c.sh new file mode 100755 index 00000000000..0e5ba084f71 --- /dev/null +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_lstm_1b22c.sh @@ -0,0 +1,680 @@ +#!/bin/bash +#TODO: this needs to be renamed and the comments changed, before merging to master. + +# 1b22c is as 1b22 but setting label delay to 8. It improves on average, even +# if not everywhere. + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a_sp exp/chain/tdnn_lstm1b21_sp exp/chain/tdnn_lstm1b22_sp exp/chain/tdnn_lstm1b22c_sp +# System tdnn_lstm1a_sp tdnn_lstm1b21_sp tdnn_lstm1b22_sp tdnn_lstm1b22c_sp +#WER dev93 (tgpr) 7.64 7.69 7.47 7.24 +#WER dev93 (tg) 7.29 7.27 7.14 7.03 +#WER dev93 (big-dict,tgpr) 5.53 5.42 5.31 5.04 +#WER dev93 (big-dict,fg) 5.14 5.04 5.00 4.92 +#WER eval92 (tgpr) 5.62 5.19 5.14 5.23 +#WER eval92 (tg) 5.30 5.00 4.93 4.78 +#WER eval92 (big-dict,tgpr) 3.62 3.24 3.12 3.17 +#WER eval92 (big-dict,fg) 3.31 2.96 2.73 2.73 +# Final train prob -0.0344 -0.0470 -0.0401 -0.0403 +# Final valid prob -0.0518 -0.0587 -0.0527 -0.0526 +# Final train prob (xent) -0.5589 -0.7782 -0.7484 -0.7406 +# Final valid prob (xent) -0.6620 -0.8210 -0.7865 -0.7766 +# Num-params 9106252 4216524 4216524 4216524 + +# 1b22 is as 1b21 but setting chain.l2-regularize to zero. + +# 1b21 is as 1b20 but half the learning rate.. + +# 1b20 is as 1b19b but reducing dimensions of TDNN layers from 512 to 448. +# 1b19b is as 1b19 but with more epochs (4->6) +# 1b19 is a rerun of 1b18d3 (a fairly small LSTM+TDNN setup). +# +# +# 1b18d3 is as 1b18d2 but reducing lstm bottleneck dim from 304 to 256. +# [1b18d2 is just a rerun of 1b18d as I merged various code changes and +# I want to make sure nothing bad happened.] +# +# Results below show it's probably slightly better than the average of 18d and 18d2 +# (which are supposed to be the same experiment)... 
+# +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18d_sp exp/chain/tdnn_lstm1b18d2_sp exp/chain/tdnn_lstm1b18d3_sp +# System tdnn_lstm1b18d_sp tdnn_lstm1b18d2_sp tdnn_lstm1b18d3_sp +#WER dev93 (tgpr) 7.78 7.46 7.46 +#WER dev93 (tg) 7.29 7.30 7.04 +#WER dev93 (big-dict,tgpr) 5.56 5.51 5.55 +#WER dev93 (big-dict,fg) 5.32 5.08 5.05 +#WER eval92 (tgpr) 5.33 5.40 5.39 +#WER eval92 (tg) 5.05 5.03 4.96 +#WER eval92 (big-dict,tgpr) 3.42 3.26 3.35 +#WER eval92 (big-dict,fg) 2.91 2.64 2.82 +# Final train prob -0.0529 -0.0536 -0.0543 +# Final valid prob -0.0633 -0.0630 -0.0636 +# Final train prob (xent) -0.8327 -0.8330 -0.8415 +# Final valid prob (xent) -0.8693 -0.8672 -0.8695 +# Num-params 4922060 4922060 4805324 + +# +# 1b18d is as 1b18c, but adding 'self-scale=2.0' to scale up the m_trunc when it is given +# as input to the affine projections (I found previously this was helpful). +# .. Interesting: objf improves but WER is not better. +# +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b18c_sp exp/chain/tdnn_lstm1b18d_sp +# System tdnn_lstm1b18c_sp tdnn_lstm1b18d_sp +#WER dev93 (tgpr) 7.77 7.78 +#WER dev93 (tg) 7.40 7.29 +#WER dev93 (big-dict,tgpr) 5.39 5.56 +#WER dev93 (big-dict,fg) 5.25 5.32 +#WER eval92 (tgpr) 5.48 5.33 +#WER eval92 (tg) 4.98 5.05 +#WER eval92 (big-dict,tgpr) 3.07 3.42 +#WER eval92 (big-dict,fg) 2.69 2.91 +# Final train prob -0.0546 -0.0529 +# Final valid prob -0.0641 -0.0633 +# Final train prob (xent) -0.8679 -0.8327 +# Final valid prob (xent) -0.8954 -0.8693 +# Num-params 4922060 4922060 + +# 1b18c is as 1b18b, but fixing a bug in the script whereby c instead of m had been used +# as input to the affine projections. + +# 1b18b is as 1b18, but doubling l2 regularization on the output +# and lstm layers, parts of them were training too slowly. +# +# 1b18 is as 1b17, but via script change, not using memory-norm (actually +# this is the same as 1b17d). +# I don't see any WER change, but objf is worse. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b17_sp exp/chain/tdnn_lstm1b17d_sp exp/chain/tdnn_lstm1b18_sp +# System tdnn_lstm1b17_sp tdnn_lstm1b17d_sp tdnn_lstm1b18_sp +#WER dev93 (tgpr) 7.49 7.44 7.48 +#WER dev93 (tg) 7.18 7.13 7.19 +#WER dev93 (big-dict,tgpr) 5.50 5.34 5.48 +#WER dev93 (big-dict,fg) 5.11 5.15 5.04 +#WER eval92 (tgpr) 5.26 5.32 5.32 +#WER eval92 (tg) 5.00 4.94 5.03 +#WER eval92 (big-dict,tgpr) 3.24 3.28 3.26 +#WER eval92 (big-dict,fg) 2.82 2.80 2.84 +# Final train prob -0.0489 -0.0486 -0.0496 +# Final valid prob -0.0583 -0.0599 -0.0612 +# Final train prob (xent) -0.7550 -0.7809 -0.7749 +# Final valid prob (xent) -0.7988 -0.8121 -0.8131 +# Num-params 4922060 4922060 4922060 + +# 1b17 is as 1b13m, it's just a rerun after some code changes (adding +# diagonal natural gradient stuff) which should make no difference. +# Still seems to be working. + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp exp/chain/tdnn_lstm1b17_sp +# System tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp tdnn_lstm1b17_sp +#WER dev93 (tgpr) 7.86 7.43 7.49 +#WER dev93 (tg) 7.40 7.00 7.18 +#WER dev93 (big-dict,tgpr) 5.65 5.21 5.50 +#WER dev93 (big-dict,fg) 5.11 4.76 5.11 +#WER eval92 (tgpr) 5.64 5.39 5.26 +#WER eval92 (tg) 5.17 5.00 5.00 +#WER eval92 (big-dict,tgpr) 3.21 3.30 3.24 +#WER eval92 (big-dict,fg) 2.84 2.62 2.82 +# Final train prob -0.0469 -0.0516 -0.0489 +# Final valid prob -0.0601 -0.0607 -0.0583 +# Final train prob (xent) -0.7424 -0.7593 -0.7550 +# Final valid prob (xent) -0.7920 -0.7982 -0.7988 +# Num-params 5456076 4922060 4922060 + +# 1b13m is as 1b13l, but reverting the LSTM script "fix" (which actually +# made things worse), so the baseline is 1b13{c,d} (and the change versus +# c,d is to add bottleneck-dim=256). 
+# +# It's helpful: +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp exp/chain/tdnn_lstm1b13m_sp +# System tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp tdnn_lstm1b13m_sp +#WER dev93 (tgpr) 7.68 7.86 7.43 +#WER dev93 (tg) 7.34 7.40 7.00 +#WER dev93 (big-dict,tgpr) 5.42 5.65 5.21 +#WER dev93 (big-dict,fg) 5.05 5.11 4.76 +#WER eval92 (tgpr) 5.48 5.64 5.39 +#WER eval92 (tg) 5.26 5.17 5.00 +#WER eval92 (big-dict,tgpr) 3.23 3.21 3.30 +#WER eval92 (big-dict,fg) 2.82 2.84 2.62 +# Final train prob -0.0490 -0.0469 -0.0516 +# Final valid prob -0.0597 -0.0601 -0.0607 +# Final train prob (xent) -0.7549 -0.7424 -0.7593 +# Final valid prob (xent) -0.7910 -0.7920 -0.7982 +# Num-params 5456076 5456076 4922060 +# +# +# 1b13l is as 1b13k, but adding bottleneck-dim=256 to the output layers. +# Definitely helpful: + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13k_sp exp/chain/tdnn_lstm1b13l_sp +# System tdnn_lstm1b13k_sp tdnn_lstm1b13l_sp +#WER dev93 (tgpr) 7.94 7.46 +#WER dev93 (tg) 7.68 7.09 +#WER dev93 (big-dict,tgpr) 5.91 5.39 +#WER dev93 (big-dict,fg) 5.56 4.94 +#WER eval92 (tgpr) 5.65 5.44 +#WER eval92 (tg) 5.32 5.09 +#WER eval92 (big-dict,tgpr) 3.49 3.15 +#WER eval92 (big-dict,fg) 3.07 2.94 +# Final train prob -0.0491 -0.0513 +# Final valid prob -0.0600 -0.0599 +# Final train prob (xent) -0.7395 -0.7490 +# Final valid prob (xent) -0.7762 -0.7860 +# Num-params 5456076 4922060 + +# 1b13k is as 1b13d, but after a script fix: previously we were using the 'c' +# for the full-matrix part of the recurrence instead of the 'm'. + +# 1b13d is as 1b13c, but a rerun after fixing a code bug whereby the natural gradient +# for the LinearComponent was turned off by default when initializing from config. +# **Update: turns out there was no difference here, the code had been ignoring +# that config variable.** +# +# It seems to optimize better, although the WER change is unclear. 
However, it's +# interesting that the average objf in the individual training jobs (train.*.log) is not better- +# but in compute_prob_train.*.log it is. It seems that the natural gradient interacts +# well with model averaging, which is what we found previously in the NG paper. + + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b13c_sp exp/chain/tdnn_lstm1b13d_sp +# System tdnn_lstm1b13c_sp tdnn_lstm1b13d_sp +#WER dev93 (tgpr) 7.68 7.86 +#WER dev93 (tg) 7.34 7.40 +#WER dev93 (big-dict,tgpr) 5.42 5.65 +#WER dev93 (big-dict,fg) 5.05 5.11 +#WER eval92 (tgpr) 5.48 5.64 +#WER eval92 (tg) 5.26 5.17 +#WER eval92 (big-dict,tgpr) 3.23 3.21 +#WER eval92 (big-dict,fg) 2.82 2.84 +# Final train prob -0.0490 -0.0469 +# Final valid prob -0.0597 -0.0601 +# Final train prob (xent) -0.7549 -0.7424 +# Final valid prob (xent) -0.7910 -0.7920 +# Num-params 5456076 5456076 +# +# +# 1b13c is as 1b13b, but after script change in which the lstmb layer was +# rewritten, adding memnorm and removing the scale of 4.0, along with some +# more minor changes and streamlining/removing options. +# +# 1b13b is as 1b13, but a rerun after merging with the memnorm-and-combine +# branch. Slight difference in num-params is because of 300 vs 304. + +# 1b13 is as 1b10 but reducing the bottleneck dim to 304 +# (because I want to get in the habit of using multiples of 8). +# WER seems improved. 
+# +# + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b10_sp exp/chain/tdnn_lstm1b13_sp +# System tdnn_lstm1b10_sp tdnn_lstm1b13_sp +#WER dev93 (tgpr) 7.87 7.63 +#WER dev93 (tg) 7.48 7.46 +#WER dev93 (big-dict,tgpr) 5.55 5.56 +#WER dev93 (big-dict,fg) 5.25 5.09 +#WER eval92 (tgpr) 5.44 5.48 +#WER eval92 (tg) 5.05 5.12 +#WER eval92 (big-dict,tgpr) 3.24 3.17 +#WER eval92 (big-dict,fg) 2.73 2.60 +# Final train prob -0.0463 -0.0470 +# Final valid prob -0.0561 -0.0565 +# Final train prob (xent) -0.7362 -0.7588 +# Final valid prob (xent) -0.7730 -0.7831 +# Num-params 5650636 5446348 + +# 1b10 is as 1b9 but reducing the cell and bottleneck dimension of LSTM layer from 512 to 384. +# Seems helpful on average-- nice! + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b9_sp exp/chain/tdnn_lstm1b10_sp +# System tdnn_lstm1b9_sp tdnn_lstm1b10_sp +#WER dev93 (tgpr) 7.74 7.87 +#WER dev93 (tg) 7.46 7.48 +#WER dev93 (big-dict,tgpr) 5.67 5.55 +#WER dev93 (big-dict,fg) 5.31 5.25 +#WER eval92 (tgpr) 5.60 5.44 +#WER eval92 (tg) 5.42 5.05 +#WER eval92 (big-dict,tgpr) 3.47 3.24 +#WER eval92 (big-dict,fg) 3.07 2.73 +# Final train prob -0.0413 -0.0463 +# Final valid prob -0.0543 -0.0561 +# Final train prob (xent) -0.6786 -0.7362 +# Final valid prob (xent) -0.7249 -0.7730 +# Num-params 7021644 5650636 + +# 1b9 is as 1b8 but adding batchnorm after the LSTM layer.. this is +# to correct an oversight. +# 1b8 is as 1b7 but with quite a few layers removed. WER effect is unclear. 
+ +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1b7_sp exp/chain/tdnn_lstm1b8_sp +# System tdnn_lstm1b7_sp tdnn_lstm1b8_sp +#WER dev93 (tgpr) 7.31 7.60 +#WER dev93 (tg) 7.10 7.25 +#WER dev93 (big-dict,tgpr) 5.26 5.26 +#WER dev93 (big-dict,fg) 4.64 4.93 +#WER eval92 (tgpr) 5.48 5.32 +#WER eval92 (tg) 5.00 5.07 +#WER eval92 (big-dict,tgpr) 3.35 3.31 +#WER eval92 (big-dict,fg) 2.99 2.84 +# Final train prob -0.0483 -0.0533 +# Final valid prob -0.0573 -0.0627 +# Final train prob (xent) -0.7207 -0.8234 +# Final valid prob (xent) -0.7467 -0.8466 +# Num-params 11752524 7021644 + +# 1b7 is as 1b6 but adding self-stabilize=true and normalize-type=none; +# and after a script-level change that scale 'c' by 4 before giving it +# to the W_all_a matrix (to see where all this came from, look at run_tdnn_lstm_1b16.sh +# in the mini_librispeech setup, although by the time you see this, that may no longer exist). +# +# 1b6 is as 1b3 but replacing renorm with batchnorm for the TDNN layers, +# and adding batchnorm to the LSTMB layers. Effect on WER unclear but generally +# it's better. 
+ + +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1{a2,a3,b3,b6}_sp +# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a2_sp exp/chain/tdnn_lstm1a3_sp exp/chain/tdnn_lstm1b3_sp exp/chain/tdnn_lstm1b6_sp +# System tdnn_lstm1a2_sp tdnn_lstm1a3_sp tdnn_lstm1b3_sp tdnn_lstm1b6_sp +#WER dev93 (tgpr) 7.47 7.65 7.26 7.32 +#WER dev93 (tg) 7.29 7.24 6.96 6.98 +#WER dev93 (big-dict,tgpr) 5.44 5.60 5.43 5.22 +#WER dev93 (big-dict,fg) 4.98 5.04 4.97 4.86 +#WER eval92 (tgpr) 5.78 5.21 5.30 5.14 +#WER eval92 (tg) 5.44 5.00 4.87 4.82 +#WER eval92 (big-dict,tgpr) 3.35 3.23 3.42 3.24 +#WER eval92 (big-dict,fg) 2.99 2.96 3.03 2.82 +# Final train prob -0.0447 -0.0410 -0.0484 -0.0503 +# Final valid prob -0.0566 -0.0518 -0.0594 -0.0599 +# Final train prob (xent) -0.6859 -0.6676 -0.7528 -0.7415 +# Final valid prob (xent) -0.7378 -0.7230 -0.8078 -0.7804 +# Num-params 9106252 9106252 11747916 11746380 + +# 1b3 is as 1a2 but with the same change as in a->b, replacing lstmp with lstmb +# 1a2 is as 1a but adding l2 regularization. + +# this is a TDNN+LSTM chain system. +# It was modified from local/nnet3/tuning/run_tdnn_lstm_lfr_1a.sh with +# reference to ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh. +# Note: we're using the same hidden-layer sizes as +# ../../tedlium/s5_r2/local/chain/run_tdnn_lstm_1e.sh despite the +# fact that we'd normally choose a smaller model for a setup with +# less data, because the Tedlium model was probably on the small side. +# Note: we normally use more parameters for LSTM-containing than TDNN-only +# systems. + +# steps/info/chain_dir_info.pl exp/chain/tdnn_lstm1a_sp +# exp/chain/tdnn_lstm1a_sp: num-iters=120 nj=2..10 num-params=9.1M dim=40+100->2889 combine=-0.047->-0.045 xent:train/valid[79,119,final]=(-0.684,-0.569,-0.564/-0.742,-0.668,-0.665) logprob:train/valid[79,119,final]=(-0.045,-0.035,-0.034/-0.058,-0.051,-0.051) + +# The following compares: +# (nnet3 TDNN+LSTM, chain TDNN, this experiment == chain TDNN+LSTM) +# system. 
+# This is consistently better than the nnet3 TDNN+LSTM, but the +# difference with the chain TDNN is inconsistent. + +# local/chain/compare_wer.sh --online exp/nnet3/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/tdnn_lstm1a_sp +# System tdnn_lstm1a_sp tdnn1a_sp tdnn_lstm1a_sp +#WER dev93 (tgpr) 8.54 7.87 7.48 +# [online:] 8.57 8.02 7.49 +#WER dev93 (tg) 8.25 7.61 7.41 +# [online:] 8.34 7.70 7.40 +#WER dev93 (big-dict,tgpr) 6.24 5.71 5.64 +# [online:] 6.40 5.60 5.70 +#WER dev93 (big-dict,fg) 5.70 5.10 5.40 +# [online:] 5.77 5.21 5.19 +#WER eval92 (tgpr) 6.52 5.23 5.67 +# [online:] 6.56 5.44 5.60 +#WER eval92 (tg) 6.13 4.87 5.46 +# [online:] 6.24 4.87 5.53 +#WER eval92 (big-dict,tgpr) 3.88 3.24 3.69 +# [online:] 3.88 3.31 3.63 +#WER eval92 (big-dict,fg) 3.38 2.71 3.28 +# [online:] 3.53 2.92 3.31 +# Final train prob -0.0414 -0.0341 +# Final valid prob -0.0634 -0.0506 +# Final train prob (xent) -0.8216 -0.5643 +# Final valid prob (xent) -0.9208 -0.6648 + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +nj=30 +train_set=train_si284 +test_sets="test_dev93 test_eval92" +gmm=tri4b # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +num_threads_ubm=32 +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. + +# Options which are not passed through to run_ivector_common.sh +affix=1b22c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# LSTM/chain options +train_stage=-10 +label_delay=8 +xent_regularize=0.1 + +# training chunk-options +chunk_width=140,100,160 +chunk_left_context=40 +chunk_right_context=0 + +# training options +srand=0 +remove_egs=true + +#decode options +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. 
+echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 15 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.01" + output_opts="l2-regularize=0.005 bottleneck-dim=256" + lstm_opts="l2-regularize=0.005 self-scale=2.0" + + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda delay=5 input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, 
so no splicing here + relu-batchnorm-layer name=tdnn1 $tdnn_opts dim=448 + relu-batchnorm-layer name=tdnn2 $tdnn_opts dim=448 input=Append(-1,0,1) + relu-batchnorm-layer name=tdnn3 $tdnn_opts dim=448 input=Append(-3,0,3) + relu-batchnorm-layer name=tdnn4 $tdnn_opts dim=448 input=Append(-3,0,3) + lstmb-layer name=lstm3 $lstm_opts cell-dim=384 bottleneck-dim=256 decay-time=20 delay=-3 + + ## adding the layers for chain branch + output-layer name=output input=lstm3 $output_opts output-delay=$label_delay include-log-softmax=false dim=$num_targets + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 $output_opts output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/tedlium-$(date +'%m_%d_%H_%M')/s5_r2/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=6 \ + --trainer.deriv-truncate-margin=10 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=10 \ + --trainer.optimization.initial-effective-lrate=0.0005 \ + --trainer.optimization.final-effective-lrate=0.00005 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 17 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/lang/check_phones_compatible.sh \ + data/lang_test_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgpr \ + $tree_dir $tree_dir/graph_tgpr || exit 1; + + utils/lang/check_phones_compatible.sh \ + data/lang_test_bd_tgpr/phones.txt $lang/phones.txt + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_bd_tgpr \ + $tree_dir $tree_dir/graph_bd_tgpr || exit 1; +fi + +if [ $stage -le 18 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + data_affix=$(echo $data | sed s/test_//) + nspk=$(wc -l (s) < computation_.submatrices.size() && s>0); + int32 ans = computation_.commands.size(); + std::vector variable_indexes; + analyzer_.variables.AppendVariablesForSubmatrix(s, &variable_indexes); + std::vector::const_iterator iter = variable_indexes.begin(), + end = variable_indexes.end(); + for (; iter != end; ++iter) { + int32 v = *iter; + const std::vector &accesses = analyzer_.variable_accesses[v]; + if (!accesses.empty()) + ans = std::min(ans, accesses[0].command_index); + } + return ans; +} + + int32 ComputationAnalysis::FirstNontrivialMatrixAccess(int32 m) const { KALDI_ASSERT(static_cast(m) < computation_.matrices.size() && m > 0); int32 ans = computation_.commands.size(); diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 259a4546d53..a82cd4cb5b1 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -321,6 +321,13 @@ class ComputationAnalysis { /// s must be >0 (i.e. not the empty submatrix). int32 FirstNontrivialAccess(int32 s) const; + /// Returns the first command (read or write) that accesses any part of 's', + /// including possibly zeroing it. 
[note: kAllocMatrix, kSwapMatrix and + /// kDeallocMatrix do not count as read or write operations]. If there is no + /// such command, it returns num_commands. s must be >0 (i.e. not the empty + /// submatrix). + int32 FirstAccess(int32 s) const; + /// Returns the last non-deallocation command that accesses any part of /// submatrix 's'; if there is no such command it returns -1. /// s must be >0 (i.e. not the empty submatrix). @@ -385,7 +392,7 @@ struct CheckComputationOptions { // legitimately fail after optimization. see code for details. bool check_rewrite; // If 'check_unused_variables' is true, it checks for unused variables - // (e.g. unused partsof matrices). We only set it false for online + // (e.g. unused parts of matrices). We only set it false for online // computations, where there can be instances where a part of a matrix is // apparently never accessed (until we consider that the matrix is swapped // with another). diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index ded700dbbd8..2a0b2dcd499 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -3464,13 +3464,12 @@ class ComputationLoopedOptimizer { /// expected to be command indexes of the kNoOperationMarker at segment /// boundaries, this function outputs for each of these command indexes a list /// of matrices which are 'active' at that point in time. By 'active' we mean - /// that the matrix has been written to before that time (note, we don't count - /// initialization with zeros as being written to); and will be read after - /// that time. These is the list of matrices that 'need to be in scope' - /// at those points in time. '*active_matrices' is indexed by the - /// same index as 'splice_point_commands', and is then a list of active - /// matrices, in numerical order of matrix index. - /// Note: for each i, (*active_matrices)[i] will be sorted and unique. 
+ /// that the matrix has been written to before that time (including zeroing), + /// and will be read after that time. These is the list of matrices that + /// 'need to be in scope' at those points in time. '*active_matrices' is + /// indexed by the same index as 'splice_point_commands', and is then a list + /// of active matrices, in numerical order of matrix index. Note: for each i, + /// (*active_matrices)[i] will be sorted and unique. static void FindActiveMatrices(const NnetComputation &computation, const Analyzer &analyzer, const std::vector &splice_point_commands, diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index e12cb7b1c42..0eb5de2c4fc 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -543,7 +543,9 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } - if (config.optimize && config.move_sizing_commands) { + + if ((config.optimize && config.move_sizing_commands) || + config.optimize_looped_computation) { MoveSizingCommands(nnet, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); From fd996017b8c9d4464f345ab56bc879b5ca25cb7a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 16 Jan 2018 22:28:33 -0500 Subject: [PATCH 069/184] [egs] Fix rt03 numbers to include swbd (thx: Gaofeng) --- .../s5c/local/chain/compare_wer_general.sh | 6 +- .../s5c/local/chain/tuning/run_tdnn_7m19b.sh | 368 ++++++++++++++++++ .../s5c/local/chain/tuning/run_tdnn_7m19c.sh | 19 +- 3 files changed, 374 insertions(+), 19 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh diff --git a/egs/swbd/s5c/local/chain/compare_wer_general.sh b/egs/swbd/s5c/local/chain/compare_wer_general.sh index fcd66d5d78d..6412a46e86a 100755 --- a/egs/swbd/s5c/local/chain/compare_wer_general.sh +++ b/egs/swbd/s5c/local/chain/compare_wer_general.sh @@ -132,7 +132,7 @@ if $include_rt03; then echo -n "# WER on rt03(tg) " for x in $*; do set_names $x 
- wer=$(grep Sum $dirname/decode_rt03*sw1_tg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + wer=$(grep Sum $dirname/decode_rt03*sw1_tg$epoch_suffix/score*/rt03_hires.ctm.filt.sys | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -141,7 +141,7 @@ if $include_rt03; then echo -n "# [looped:] " for x in $*; do set_names $x - wer=$(grep Sum $dirname/decode_rt03*sw1_tg${epoch_suffix}_looped/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + wer=$(grep Sum $dirname/decode_rt03*sw1_tg${epoch_suffix}_looped/score*/rt03_hires.ctm.filt.sys | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo @@ -150,7 +150,7 @@ if $include_rt03; then echo -n "# WER on rt03(fg) " for x in $*; do set_names $x - wer=$(grep Sum $dirname/decode_rt03*sw1_fsh_fg$epoch_suffix/score*/*ys | grep -v swbd | utils/best_wer.sh | awk '{print $2}') + wer=$(grep Sum $dirname/decode_rt03*sw1_fsh_fg$epoch_suffix/score*/rt03_hires.ctm.filt.sys | utils/best_wer.sh | awk '{print $2}') printf "% 10s" $wer done echo diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh new file mode 100755 index 00000000000..aaebe038e99 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh @@ -0,0 +1,368 @@ +#!/bin/bash +# TODO: this will be moved before merging to master. + +# 7m19b is as 7m19 but with some bypass connections. Helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 19.1 19.0 +# WER on rt03(fg) 16.6 16.4 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. +# Seems a little better but could be due to the increase in parameters. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m19b +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn2 $opts input=Append(-1,0,1) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn3 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn4 $opts input=Append(-1,0,1) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn5 $opts dim=1280 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn6 $opts input=Append(-3,0,3,tdnn4) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn7 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn8 $opts input=Append(-3,0,3,tdnn6) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn9 $opts input=Append(-3,0,3) dim=1280 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn10 $opts input=Append(-3,0,3,tdnn8) dim=1280 bottleneck-dim=256 + + relu-batchnorm-layer name=prefinal-chain input=tdnn10 $opts dim=1280 bottleneck-dim=256 + output-layer 
name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn10 $opts dim=1280 bottleneck-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh index 8cc029744a1..5fe29ac3562 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19c.sh @@ -2,7 +2,7 @@ # Note: before merging to master, this will be renamed. # 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). -# Seems about 0.1% better. +# Effect is unclear. 
# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp # System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp @@ -10,27 +10,14 @@ # WER on train_dev(fg) 12.12 11.87 11.82 # WER on eval2000(tg) 15.8 15.6 15.4 # WER on eval2000(fg) 14.3 14.0 13.8 -# WER on rt03(tg) 14.8 14.9 14.8 -# WER on rt03(fg) 12.4 12.5 12.5 +# WER on rt03(tg) 19.1 19.0 19.1 +# WER on rt03(fg) 16.6 16.4 16.6 # Final train prob -0.096 -0.096 -0.094 # Final valid prob -0.106 -0.106 -0.103 # Final train prob (xent) -1.198 -1.188 -1.117 # Final valid prob (xent) -1.2070 -1.1980 -1.1223 # Num-parameters 15528996 16512036 17824036 -# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp -# System tdnn7m19_sp tdnn7m19b_sp -# WER on train_dev(tg) 13.09 12.93 -# WER on train_dev(fg) 12.12 11.87 -# WER on eval2000(tg) 15.8 15.6 -# WER on eval2000(fg) 14.3 14.0 -# WER on rt03(tg) 14.8 14.9 -# WER on rt03(fg) 12.4 12.5 -# Final train prob -0.096 -0.096 -# Final valid prob -0.106 -0.106 -# Final train prob (xent) -1.198 -1.188 -# Final valid prob (xent) -1.2070 -1.1980 -# Num-parameters 15528996 16512036 # 7m19 is as 7m16 but adding an extra -3,0,3 layer. # CAUTION: messing with queue opts. From b7691d466283fa733bdeeb2006300fd67e4b4c9d Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 18 Jan 2018 19:12:27 -0500 Subject: [PATCH 070/184] [egs] Add new tuning script (not in its final place) --- .../s5c/local/chain/tuning/run_tdnn_7m19b.sh | 26 +- .../s5c/local/chain/tuning/run_tdnn_7m19h.sh | 428 ++++++++++++++++++ 2 files changed, 441 insertions(+), 13 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh index aaebe038e99..fdc4b63d59b 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19b.sh @@ -3,19 +3,19 @@ # 7m19b is as 7m19 but with some bypass connections. Helpful. 
-# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp -# System tdnn7m19_sp tdnn7m19b_sp -# WER on train_dev(tg) 13.09 12.93 -# WER on train_dev(fg) 12.12 11.87 -# WER on eval2000(tg) 15.8 15.6 -# WER on eval2000(fg) 14.3 14.0 -# WER on rt03(tg) 19.1 19.0 -# WER on rt03(fg) 16.6 16.4 -# Final train prob -0.096 -0.096 -# Final valid prob -0.106 -0.106 -# Final train prob (xent) -1.198 -1.188 -# Final valid prob (xent) -1.2070 -1.1980 -# Num-parameters 15528996 16512036 +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m10_sp tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.77 13.09 12.93 +# WER on train_dev(fg) 12.65 12.12 11.87 +# WER on eval2000(tg) 16.1 15.8 15.6 +# WER on eval2000(fg) 14.3 14.3 14.0 +# WER on rt03(tg) 19.9 19.1 19.0 +# WER on rt03(fg) 17.4 16.6 16.4 +# Final train prob -0.111 -0.096 -0.096 +# Final valid prob -0.120 -0.106 -0.106 +# Final train prob (xent) -1.314 -1.198 -1.188 +# Final valid prob (xent) -1.3247 -1.2070 -1.1980 +# Num-parameters 13361700 15528996 16512036 # 7m19 is as 7m16 but adding an extra -3,0,3 layer. diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh new file mode 100755 index 00000000000..b509517da68 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh @@ -0,0 +1,428 @@ +#!/bin/bash + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! + +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 
16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. +# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. 
+
+# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp
+# System tdnn7m8_sp tdnn7m9_sp
+# WER on train_dev(tg) 13.60 13.88
+# WER on train_dev(fg) 12.62 12.64
+# WER on eval2000(tg) 16.8 16.1
+# WER on eval2000(fg) 15.4 14.4
+# WER on rt03(tg) 16.2 15.5
+# WER on rt03(fg) 13.7 13.1
+# Final train prob -0.105 -0.111
+# Final valid prob -0.115 -0.119
+# Final train prob (xent) -1.282 -1.309
+# Final valid prob (xent) -1.3194 -1.3246
+# Num-parameters 11580452 13818148
+
+# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which
+# is the same as 7m2->7m3, which was helpful there.
+# Does seem helpful.
+
+# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp
+# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp
+# WER on train_dev(tg) 13.70 13.74 13.81 13.60
+# WER on train_dev(fg) 12.67 12.76 12.74 12.62
+# WER on eval2000(tg) 16.6 17.1 17.0 16.8
+# WER on eval2000(fg) 15.1 15.4 15.4 15.4
+# WER on rt03(tg) 16.1 16.2 16.0 16.2
+# WER on rt03(fg) 13.7 13.8 13.6 13.7
+# Final train prob -0.085 -0.106 -0.104 -0.105
+# Final valid prob -0.103 -0.118 -0.116 -0.115
+# Final train prob (xent) -1.230 -1.296 -1.285 -1.282
+# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194
+# Num-parameters 16292693 10924836 11580452 11580452
+
+
+# 7m5b is as 7m5 but reducing the prefinal layer dims to previous values.
+# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average,
+# but not clear at all... for consistency with other setups I may retain
+# this change.
+
+# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp
+# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp
+# WER on train_dev(tg) 13.70 13.74 13.71 13.81
+# WER on train_dev(fg) 12.67 12.76 12.64 12.74
+# WER on eval2000(tg) 16.6 17.1 16.8 17.0
+# WER on eval2000(fg) 15.1 15.4 15.1 15.4
+# WER on rt03(tg) 16.1 16.2 16.2 16.0
+# WER on rt03(fg) 13.7 13.8 13.8 13.6
+# Final train prob -0.085 -0.106 -0.103 -0.104
+# Final valid prob -0.103 -0.118 -0.114 -0.116
+# Final train prob (xent) -1.230 -1.296 -1.274 -1.285
+# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283
+# Num-parameters 16292693 10924836 12170788 11580452
+
+
+# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer
+# and the prefinal layers from 512 to 768.
+# 7m2 is as 7m but with a bunch of tuning changes (model is smaller).
+# 7m is as 7k but adding two non-splicing layers towards the beginning of the
+# network.
+# The improvement is pretty small but I've seen similar improvements on other
+# setups with this architecture so I tend to believe it.
+ + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m19h +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn2 $opts input=Append(-1,0,1) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn3 $opts dim=1536 bottleneck-dim=192 + relu-batchnorm-layer name=tdnn4 $opts input=Append(-1,0,1) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4, tdnn2) bottleneck-dim=192 + relu-batchnorm-layer name=tdnn6 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn7 $opts input=Append(-3,0,3,tdnn5) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn8 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn9 $opts input=Append(-3,0,3,tdnn7) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn10 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn11 $opts input=Append(-3,0,3,tdnn9) dim=1536 
bottleneck-dim=256 + + relu-batchnorm-layer name=prefinal-chain input=tdnn11 $opts dim=1536 bottleneck-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11 $opts dim=1536 bottleneck-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is 
concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; From 85a2c60e3697e54711e33fbff854f218597745c7 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sat, 20 Jan 2018 21:34:27 +0330 Subject: [PATCH 071/184] [src] Print informative error if num-ceps >= num-mel-bins in MFCC (#2166) --- src/feat/feature-mfcc.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/feat/feature-mfcc.cc b/src/feat/feature-mfcc.cc index c1962a5c1d1..122ba1b100d 100644 --- a/src/feat/feature-mfcc.cc +++ b/src/feat/feature-mfcc.cc @@ -82,7 +82,14 @@ void MfccComputer::Compute(BaseFloat signal_log_energy, MfccComputer::MfccComputer(const MfccOptions &opts): opts_(opts), srfft_(NULL), mel_energies_(opts.mel_opts.num_bins) { + int32 num_bins = opts.mel_opts.num_bins; + if (opts.num_ceps > num_bins) + KALDI_ERR << "num-ceps cannot be larger than num-mel-bins." + << " It should be smaller or equal. You provided num-ceps: " + << opts.num_ceps << " and num-mel-bins: " + << num_bins; + Matrix dct_matrix(num_bins, num_bins); ComputeDctMatrix(&dct_matrix); // Note that we include zeroth dct in either case. 
If using the From 35950ea2461f63e7de9423456c13abb22a396ac7 Mon Sep 17 00:00:00 2001 From: Xingyu Na Date: Mon, 22 Jan 2018 02:50:14 +0800 Subject: [PATCH 072/184] [egs] add speaker recognition recipe for aishell (#2111) --- egs/aishell/README.txt | 4 + egs/aishell/v1/README.txt | 8 ++ egs/aishell/v1/cmd.sh | 15 +++ egs/aishell/v1/conf/mfcc.conf | 3 + egs/aishell/v1/conf/vad.conf | 2 + egs/aishell/v1/local/aishell_data_prep.sh | 63 +++++++++++ egs/aishell/v1/local/download_and_untar.sh | 105 ++++++++++++++++++ egs/aishell/v1/local/produce_trials.py | 35 ++++++ .../v1/local/split_data_enroll_eval.py | 37 ++++++ egs/aishell/v1/path.sh | 5 + egs/aishell/v1/run.sh | 94 ++++++++++++++++ egs/aishell/v1/sid | 1 + egs/aishell/v1/steps | 1 + egs/aishell/v1/utils | 1 + 14 files changed, 374 insertions(+) create mode 100644 egs/aishell/v1/README.txt create mode 100644 egs/aishell/v1/cmd.sh create mode 100644 egs/aishell/v1/conf/mfcc.conf create mode 100644 egs/aishell/v1/conf/vad.conf create mode 100755 egs/aishell/v1/local/aishell_data_prep.sh create mode 100755 egs/aishell/v1/local/download_and_untar.sh create mode 100755 egs/aishell/v1/local/produce_trials.py create mode 100755 egs/aishell/v1/local/split_data_enroll_eval.py create mode 100755 egs/aishell/v1/path.sh create mode 100755 egs/aishell/v1/run.sh create mode 120000 egs/aishell/v1/sid create mode 120000 egs/aishell/v1/steps create mode 120000 egs/aishell/v1/utils diff --git a/egs/aishell/README.txt b/egs/aishell/README.txt index 0dcea0977cc..f37e6dfaeb5 100644 --- a/egs/aishell/README.txt +++ b/egs/aishell/README.txt @@ -5,5 +5,9 @@ Aishell is an open Chinese Mandarin speech database published by Beijing Shell S The database can be downloaded from openslr: http://www.openslr.org/33/ +This folder contains two subfolders: +s5: a speech recognition recipe +v1: a speaker recognition recipe + For more details, please visit: http://www.aishelltech.com/kysjcp diff --git a/egs/aishell/v1/README.txt 
b/egs/aishell/v1/README.txt new file mode 100644 index 00000000000..43e26d0817d --- /dev/null +++ b/egs/aishell/v1/README.txt @@ -0,0 +1,8 @@ +The database can be downloaded from openslr: +http://www.openslr.org/33/ + +For more details, please visit: +http://www.aishelltech.com/kysjcp + +We use the training set to train model, +and split the test set into enroll and eval. diff --git a/egs/aishell/v1/cmd.sh b/egs/aishell/v1/cmd.sh new file mode 100644 index 00000000000..d1ca1a6d126 --- /dev/null +++ b/egs/aishell/v1/cmd.sh @@ -0,0 +1,15 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" + + diff --git a/egs/aishell/v1/conf/mfcc.conf b/egs/aishell/v1/conf/mfcc.conf new file mode 100644 index 00000000000..f40379a0803 --- /dev/null +++ b/egs/aishell/v1/conf/mfcc.conf @@ -0,0 +1,3 @@ +--sample-frequency=16000 +--num-mel-bins=40 #higher than the default which is 23 +--num-ceps=20 # higher than the default which is 12. 
diff --git a/egs/aishell/v1/conf/vad.conf b/egs/aishell/v1/conf/vad.conf new file mode 100644 index 00000000000..a0ca2449b10 --- /dev/null +++ b/egs/aishell/v1/conf/vad.conf @@ -0,0 +1,2 @@ +--vad-energy-threshold=5.5 +--vad-energy-mean-scale=0.5 diff --git a/egs/aishell/v1/local/aishell_data_prep.sh b/egs/aishell/v1/local/aishell_data_prep.sh new file mode 100755 index 00000000000..70d6ba1f3e5 --- /dev/null +++ b/egs/aishell/v1/local/aishell_data_prep.sh @@ -0,0 +1,63 @@ +#!/bin/bash + +# Copyright 2017 Xingyu Na +# Apache 2.0 + +. ./path.sh || exit 1; + +if [ $# != 2 ]; then + echo "Usage: $0 " + echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript" + exit 1; +fi + +aishell_audio_dir=$1 +aishell_text_dir=$2 + +train_dir=data/local/train +dev_dir=data/local/dev +test_dir=data/local/test + +mkdir -p $train_dir +mkdir -p $dev_dir +mkdir -p $test_dir + +# data directory check +if [ ! -d $aishell_audio_dir ] || [ ! -d $aishell_text_dir ]; then + echo "Error: $0 requires two directory arguments" + exit 1; +fi + +# find wav audio file for train, dev and test resp. 
+find $aishell_audio_dir -iname "*.wav" | grep -i "wav/train" > $train_dir/wav.flist || exit 1; +find $aishell_audio_dir -iname "*.wav" | grep -i "wav/dev" > $dev_dir/wav.flist || exit 1; +find $aishell_audio_dir -iname "*.wav" | grep -i "wav/test" > $test_dir/wav.flist || exit 1; + +n=`cat $train_dir/wav.flist $dev_dir/wav.flist $test_dir/wav.flist | wc -l` +[ $n -ne 141925 ] && \ + echo Warning: expected 141925 data data files, found $n + +# Transcriptions preparation +for dir in $train_dir $test_dir; do + echo Preparing $dir transcriptions + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' |\ + sort > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' |\ + sort > $dir/utt2spk_all + paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all + utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text_dir/*.txt > $dir/transcripts.txt + awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk + utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp + sort -u $dir/transcripts.txt > $dir/text + utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt +done + +mkdir -p data/train data/test +for f in spk2utt utt2spk wav.scp text; do + cp $train_dir/$f data/train/$f || exit 1; + cp $test_dir/$f data/test/$f || exit 1; +done + +echo "$0: AISHELL data preparation succeeded" +exit 0; diff --git a/egs/aishell/v1/local/download_and_untar.sh b/egs/aishell/v1/local/download_and_untar.sh new file mode 100755 index 00000000000..0189bad1d4a --- /dev/null +++ b/egs/aishell/v1/local/download_and_untar.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# 2017 Xingyu Na +# Apache 2.0 + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 
/export/a05/xna/data www.openslr.org/resources/33 data_aishell" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_aishell, resource." +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_aishell resource" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + +# sizes of the archive files in bytes. +sizes="15582913665 1246920" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.gz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! which wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data + +if ! 
tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +if [ $part == "data_aishell" ]; then + cd $data/$part/wav + for wav in ./*.tar.gz; do + echo "Extracting wav from $wav" + tar -zxf $wav && rm $wav + done +fi + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi + +exit 0; diff --git a/egs/aishell/v1/local/produce_trials.py b/egs/aishell/v1/local/produce_trials.py new file mode 100755 index 00000000000..d01f7eb7da3 --- /dev/null +++ b/egs/aishell/v1/local/produce_trials.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python3 +# Copyright 2017 Bengu Wu +# Apache 2.0. + +# This script generate trials file. +# Trial file is formatted as: +# uttid spkid target|nontarget + +# If uttid belong to spkid, it is marked 'target', +# otherwise is 'nontarget'. +# input: eval set uttspk file +# output: trial file + +import sys + +fnutt = sys.argv[1] +ftrial = open(sys.argv[2], 'w') + +dictutt = {} +for line in open(fnutt): + utt2spk = line.rstrip('\r\t\n ') + spk = utt2spk.split(' ')[1] + if spk not in dictutt: + dictutt[spk] = spk + +for line in open(fnutt): + utt2spk = line.rstrip('\r\t\n ') + utt, spk = utt2spk.split(' ') + for target in dictutt: + if target == spk: + trial = utt + ' ' + target + ' target' + else: + trial = utt + ' ' + target + ' nontarget' + ftrial.write(trial + '\n') +ftrial.close() diff --git a/egs/aishell/v1/local/split_data_enroll_eval.py b/egs/aishell/v1/local/split_data_enroll_eval.py new file mode 100755 index 00000000000..7aa45121c17 --- /dev/null +++ b/egs/aishell/v1/local/split_data_enroll_eval.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Bengu Wu +# Apache 2.0. 
+ +# This script splits the test set utt2spk into enroll set and eval set +# For each speaker, 3 utterances are randomly selected as enroll samples, +# and the others are used as eval samples for evaluation +# input: test utt2spk +# output: enroll utt2spk, eval utt2spk + +import sys,random + +dictutt = {} + +for line in open(sys.argv[1]): + line = line.rstrip('\r\t\n ') + utt, spk = line.split(' ') + if spk not in dictutt: + dictutt[spk] = [] + dictutt[spk].append(utt) + +fenroll = open(sys.argv[2], 'w') +feval = open(sys.argv[3], 'w') + +for key in dictutt: + utts = dictutt[key] + random.shuffle(utts) + for i in range(0, len(utts)): + line = utts[i] + ' ' + key + if(i < 3): + fenroll.write(line + '\n') + else: + feval.write(line + '\n') + +fenroll.close() +feval.close() diff --git a/egs/aishell/v1/path.sh b/egs/aishell/v1/path.sh new file mode 100755 index 00000000000..e50f57c5271 --- /dev/null +++ b/egs/aishell/v1/path.sh @@ -0,0 +1,5 @@ +export KALDI_ROOT=`pwd`/../../.. +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sph2pipe_v2.5:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/aishell/v1/run.sh b/egs/aishell/v1/run.sh new file mode 100755 index 00000000000..0aaa6d493d6 --- /dev/null +++ b/egs/aishell/v1/run.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# Copyright 2017 Beijing Shell Shell Tech. Co. Ltd. (Authors: Hui Bu) +# 2017 Jiayu Du +# 2017 Chao Li +# 2017 Xingyu Na +# 2017 Bengu Wu +# 2017 Hao Zheng +# Apache 2.0 + +# This is a shell script that we demonstrate speech recognition using AIShell-1 data. +# it's recommended that you run the commands one by one by copying and pasting into the shell. +# See README.txt for more info on data required. 
+# Results (EER) are inline in comments below + +data=/export/a05/xna/data +data_url=www.openslr.org/resources/33 + +. ./cmd.sh +. ./path.sh + +set -e # exit on error + +local/download_and_untar.sh $data $data_url data_aishell +local/download_and_untar.sh $data $data_url resource_aishell + +# Data Preparation +local/aishell_data_prep.sh $data/data_aishell/wav $data/data_aishell/transcript + +# Now make MFCC features. +# mfccdir should be some place with a largish disk where you +# want to store MFCC features. +mfccdir=mfcc +for x in train test; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/$x exp/make_mfcc/$x $mfccdir + sid/compute_vad_decision.sh --nj 10 --cmd "$train_cmd" data/$x exp/make_mfcc/$x $mfccdir + utils/fix_data_dir.sh data/$x +done + +# train diag ubm +sid/train_diag_ubm.sh --nj 10 --cmd "$train_cmd" --num-threads 16 \ + data/train 1024 exp/diag_ubm_1024 + +#train full ubm +sid/train_full_ubm.sh --nj 10 --cmd "$train_cmd" data/train \ + exp/diag_ubm_1024 exp/full_ubm_1024 + +#train ivector +sid/train_ivector_extractor.sh --cmd "$train_cmd --mem 10G" \ + --num-iters 5 exp/full_ubm_1024/final.ubm data/train \ + exp/extractor_1024 + +#extract ivector +sid/extract_ivectors.sh --cmd "$train_cmd" --nj 10 \ + exp/extractor_1024 data/train exp/ivector_train_1024 + +#train plda +$train_cmd exp/ivector_train_1024/log/plda.log \ + ivector-compute-plda ark:data/train/spk2utt \ + 'ark:ivector-normalize-length scp:exp/ivector_train_1024/ivector.scp ark:- |' \ + exp/ivector_train_1024/plda + +#split the test to enroll and eval +mkdir -p data/test/enroll data/test/eval +cp data/test/{spk2utt,feats.scp,vad.scp} data/test/enroll +cp data/test/{spk2utt,feats.scp,vad.scp} data/test/eval +local/split_data_enroll_eval.py data/test/utt2spk data/test/enroll/utt2spk data/test/eval/utt2spk +trials=data/test/aishell_speaker_ver.lst +local/produce_trials.py data/test/eval/utt2spk $trials +utils/fix_data_dir.sh data/test/enroll +utils/fix_data_dir.sh data/test/eval + 
+#extract enroll ivector +sid/extract_ivectors.sh --cmd "$train_cmd" --nj 10 \ + exp/extractor_1024 data/test/enroll exp/ivector_enroll_1024 +#extract eval ivector +sid/extract_ivectors.sh --cmd "$train_cmd" --nj 10 \ + exp/extractor_1024 data/test/eval exp/ivector_eval_1024 + +#compute plda score +$train_cmd exp/ivector_eval_1024/log/plda_score.log \ + ivector-plda-scoring --num-utts=ark:exp/ivector_enroll_1024/num_utts.ark \ + exp/ivector_train_1024/plda \ + ark:exp/ivector_enroll_1024/spk_ivector.ark \ + "ark:ivector-normalize-length scp:exp/ivector_eval_1024/ivector.scp ark:- |" \ + "cat '$trials' | awk '{print \\\$2, \\\$1}' |" exp/trials_out + +#compute eer +awk '{print $3}' exp/trials_out | paste - $trials | awk '{print $1, $4}' | compute-eer - + +# Result +# Scoring against data/test/aishell_speaker_ver.lst +# Equal error rate is 0.140528%, at threshold -12.018 + +exit 0 diff --git a/egs/aishell/v1/sid b/egs/aishell/v1/sid new file mode 120000 index 00000000000..893a12f30c9 --- /dev/null +++ b/egs/aishell/v1/sid @@ -0,0 +1 @@ +../../sre08/v1/sid \ No newline at end of file diff --git a/egs/aishell/v1/steps b/egs/aishell/v1/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/aishell/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/aishell/v1/utils b/egs/aishell/v1/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/aishell/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file From d6391f8640f1f92fe27340e87ad797abfb76da3d Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Mon, 22 Jan 2018 02:12:33 -0500 Subject: [PATCH 073/184] [scripts,egs] Fix bug in slurm.pl (thx:@kamo-naoyuki), remove outdated results. 
--- egs/fisher_swbd/s5/RESULTS | 40 ------------------------------ egs/wsj/s5/utils/parallel/slurm.pl | 4 +-- 2 files changed, 2 insertions(+), 42 deletions(-) diff --git a/egs/fisher_swbd/s5/RESULTS b/egs/fisher_swbd/s5/RESULTS index 27de4757966..14a00465b51 100644 --- a/egs/fisher_swbd/s5/RESULTS +++ b/egs/fisher_swbd/s5/RESULTS @@ -40,46 +40,6 @@ for x in exp/nnet2_online/nnet_ms_a_online/decode_eval2000*_fg; do grep Sum $x/ %WER 12.3 | 1831 21395 | 89.2 7.2 3.5 1.5 12.3 50.8 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_fsh_sw1_fg/score_13/eval2000.ctm.swbd.filt.sys %WER 11.8 | 1831 21395 | 89.6 7.2 3.2 1.4 11.8 49.0 | exp/nnet2_online/nnet_ms_a_online/decode_eval2000_utt_offline_fsh_sw1_fg/score_11/eval2000.ctm.swbd.filt.sys -# nnet3 result on eval2000 -# BLSTM ran for about 760 hours, command: -# local/nnet3/run_lstm.sh --affix bidirectional --lstm-delay " [-1,1] [-2,2] [-3,3] " --label-delay 0 \ -# --cell-dim 1024 --recurrent-projection-dim 128 --non-recurrent-projection-dim 128 \ -# --chunk-left-context 40 --chunk-right-context 40 \ -# --extra-left-context 50 --extra-right-context 50 -# use tri-gram -for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done -%WER 15.8 | 4459 42989 | 86.1 9.7 4.1 1.9 15.8 52.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys -%WER 14.8 | 4459 42989 | 87.2 9.4 3.4 2.1 14.8 52.2 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_13_0.0/eval2000_hires.ctm.filt.sys -%WER 14.8 | 4459 42989 | 86.6 9.2 4.3 1.4 14.8 54.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys -# rescore with four-gram -for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.filt.sys | utils/best_wer.sh ; done -%WER 15.4 | 4459 42989 | 86.4 9.5 4.0 1.8 15.4 51.6 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.filt.sys -%WER 14.5 | 4459 42989 | 87.5 
9.0 3.5 2.0 14.5 51.4 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_14_0.0/eval2000_hires.ctm.filt.sys -%WER 14.5 | 4459 42989 | 87.0 9.0 4.0 1.5 14.5 53.7 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys - -# nnet3 result on eval2000 for swbd subset -# use tri-gram -for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done -%WER 11.6 | 1831 21395 | 89.7 7.3 3.0 1.3 11.6 47.7 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys -%WER 10.3 | 1831 21395 | 91.0 6.4 2.5 1.3 10.3 45.9 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_19_0.0/eval2000_hires.ctm.swbd.filt.sys -%WER 10.7 | 1831 21395 | 90.3 6.7 3.0 1.0 10.7 45.9 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys -# rescore with four-gram -for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done -%WER 11.1 | 1831 21395 | 90.2 7.0 2.8 1.3 11.1 46.2 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys -%WER 10.0 | 1831 21395 | 91.3 6.3 2.4 1.3 10.0 45.1 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_19_1.0/eval2000_hires.ctm.swbd.filt.sys -%WER 10.4 | 1831 21395 | 90.6 6.5 2.9 1.0 10.4 45.3 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys - -# nnet3 result on eval2000 for callhm subset -# use tri-gram -for x in exp/nnet3/*/decode_eval2000*tg; do grep Sum $x/score_*/*.ctm.callhm.filt.sys | utils/best_wer.sh ; done -%WER 19.9 | 2628 21594 | 82.6 12.1 5.3 2.6 19.9 56.0 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys -%WER 19.0 | 2628 21594 | 83.5 11.7 4.8 2.5 19.0 56.5 | 
exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_tg_epoch2.adj/score_14_0.5/eval2000_hires.ctm.callhm.filt.sys -%WER 18.8 | 2628 21594 | 83.1 11.7 5.2 1.9 18.8 60.2 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys -# rescore with four-gram -for x in exp/nnet3/*/decode_eval2000*fg; do grep Sum $x/score_*/*.ctm.swbd.filt.sys | utils/best_wer.sh ; done -%WER 19.7 | 2628 21594 | 82.7 12.1 5.2 2.4 19.7 55.3 | exp/nnet3/tdnn_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.callhm.filt.sys -%WER 18.7 | 2628 21594 | 83.7 11.5 4.8 2.5 18.7 55.6 | exp/nnet3/tdnn_sp_smbr/decode_eval2000_fsh_sw1_fg_epoch2.adj/score_14_0.0/eval2000_hires.ctm.callhm.filt.sys -%WER 18.6 | 2628 21594 | 83.3 11.5 5.2 1.9 18.6 59.6 | exp/nnet3/lstm_bidirectional_sp/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.callhm.filt.sys # chain result on eval2000 # BLSTM ran for about 380 hours diff --git a/egs/wsj/s5/utils/parallel/slurm.pl b/egs/wsj/s5/utils/parallel/slurm.pl index baad329c937..919b32045a2 100755 --- a/egs/wsj/s5/utils/parallel/slurm.pl +++ b/egs/wsj/s5/utils/parallel/slurm.pl @@ -394,7 +394,7 @@ sub exec_command { print Q "if [ \"\$CUDA_VISIBLE_DEVICES\" == \"NoDevFiles\" ]; then\n"; print Q " ( echo CUDA_VISIBLE_DEVICES set to NoDevFiles, unsetting it... \n"; print Q " )>>$logfile\n"; -print Q " unset CUDA_VISIBLE_DEVICES.\n"; +print Q " unset CUDA_VISIBLE_DEVICES\n"; print Q "fi\n"; print Q "time1=\`date +\"%s\"\`\n"; print Q " ( $cmd ) &>>$logfile\n"; @@ -506,7 +506,7 @@ sub exec_command { if ($squeue_status == 1) { # time to make sure it is not just delayed creation of the syncfile. - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. 
sleep(4); # Sometimes NFS gets confused and thinks it's transmitted the directory From c3dd60f9aa9b1e126923916c281ff2e31a3a8bc4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 22 Jan 2018 17:59:09 -0500 Subject: [PATCH 074/184] [egs] Adding some of the more interesting tuning experiments; add xconfig layer for linear-component --- .../s5c/local/chain/tuning/run_tdnn_7m19m.sh | 473 +++++++++++++++++ .../s5c/local/chain/tuning/run_tdnn_7m23b.sh | 482 +++++++++++++++++ .../s5c/local/chain/tuning/run_tdnn_7m23b2.sh | 501 ++++++++++++++++++ egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 3 +- .../libs/nnet3/xconfig/trivial_layers.py | 76 +++ 5 files changed, 1534 insertions(+), 1 deletion(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19m.sh create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b.sh create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b2.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19m.sh new file mode 100755 index 00000000000..ed533b7da29 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19m.sh @@ -0,0 +1,473 @@ +#!/bin/bash + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... 
doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m19m +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 bottleneck-dim=192 + linear-component name=tdnn1l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn2 $opts input=Append(-1,0,1) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn3 $opts dim=1536 bottleneck-dim=192 + linear-component name=tdnn3l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn4 $opts input=Append(-1,0,1) dim=1536 bottleneck-dim=256 + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4, tdnn2) bottleneck-dim=192 + linear-component name=tdnn5l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn6 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + linear-component name=tdnn6l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn7 $opts input=Append(-3,0,3,tdnn5l,tdnn3l,tdnn1l) dim=1536 bottleneck-dim=256 + linear-component name=tdnn7l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn8 $opts 
input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + linear-component name=tdnn8l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn9 $opts input=Append(-3,0,3,tdnn7l,tdnn5l,tdnn3l) dim=1536 bottleneck-dim=256 + linear-component name=tdnn9l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn10 $opts input=Append(-3,0,3) dim=1536 bottleneck-dim=256 + linear-component name=tdnn10l dim=512 $linear_opts + relu-batchnorm-layer name=tdnn11 $opts input=Append(-3,0,3,tdnn9l,tdnn7l,tdnn5l) dim=1536 bottleneck-dim=256 + + relu-batchnorm-layer name=prefinal-chain input=tdnn11 $opts dim=1536 bottleneck-dim=256 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11 $opts dim=1536 bottleneck-dim=256 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b.sh new file mode 100755 index 00000000000..7b0f45e6899 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b.sh @@ -0,0 +1,482 @@ +#!/bin/bash + +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages... +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. +# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... 
doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m23b +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 + linear-component name=tdnn1l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn2l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1536 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn4l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4l, tdnn2l) bottleneck-dim=192 + linear-component name=tdnn5l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn5l,tdnn3l,tdnn1l) dim=1536 + linear-component name=tdnn7l dim=256 
$linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn7l,tdnn5l,tdnn3l) dim=1536 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn9l,tdnn7l,tdnn5l) dim=1536 + + relu-batchnorm-layer name=prefinal-chain input=tdnn11 $opts dim=1536 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11 $opts dim=1536 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b2.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b2.sh new file mode 100755 index 00000000000..9f943cf7d4d --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23b2.sh @@ -0,0 +1,501 @@ +#!/bin/bash + +# 7m23b2 is as 7m23b but fixing an issue at the last layers. +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages. Interestingly, objf is not better than 23, but +# WER is slightly better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# System tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# WER on train_dev(tg) 12.55 12.23 12.38 +# WER on train_dev(fg) 11.52 11.29 11.44 +# WER on eval2000(tg) 15.2 15.2 15.1 +# WER on eval2000(fg) 13.6 13.7 13.6 +# WER on rt03(tg) 18.6 18.7 18.4 +# WER on rt03(fg) 16.2 16.3 16.1 +# Final train prob -0.089 -0.083 -0.084 +# Final valid prob -0.101 -0.097 -0.098 +# Final train prob (xent) -1.080 -1.025 -1.049 +# Final valid prob (xent) -1.0990 -1.0548 -1.0661 +# Num-parameters 21055012 23120164 23120164 + + +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. 
+# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! + +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 
16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. +# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. +# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. 
+ + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m23b2 +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 + linear-component name=tdnn1l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn2l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1536 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn4l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4l, tdnn2l) bottleneck-dim=192 + linear-component name=tdnn5l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn5l,tdnn3l,tdnn1l) dim=1536 + linear-component name=tdnn7l dim=256 
$linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn7l,tdnn5l,tdnn3l) dim=1536 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn9l,tdnn7l,tdnn5l) dim=1536 + linear-component name=tdnn11l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=tdnn11l $opts dim=1536 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11l $opts dim=1536 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index c6b0619bca8..6fbde1fbbcc 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -69,7 +69,8 @@ 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, 'renorm-component': xlayers.XconfigRenormComponent, - 'no-op-component': xlayers.XconfigNoOpComponent + 'no-op-component': xlayers.XconfigNoOpComponent, + 'linear-component': xlayers.XconfigLinearComponent } # Turn a config line and a list of previous layers into diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py index ef05887e469..63f6278d1ca 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py @@ -121,3 +121,79 @@ def _generate_config(self): self.name, input_desc)) configs.append(line) return configs + + +class XconfigLinearComponent(XconfigLayerBase): + """This class is for parsing lines like + 'linear-component name=linear1 dim=1024 input=Append(-3,0,3)' + which will produce just a single component, of type LinearComponent, with + output-dim 1024 in this case, and input-dim determined by the dimension + of the input.
+ + Parameters of the class, and their defaults: + input='[-1]' [Descriptor giving the input of the layer.] + dim=-1 [Dimension of the output] + + The following (shown with their effective defaults) are just passed through + to the component's config line. + + orthonormal-constraint=-1 + max-change=0.75 + l2-regularize=0.0 + + """ + def __init__(self, first_token, key_to_value, prev_names=None): + XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) + + def set_default_configs(self): + self.config = {'input': '[-1]', + 'dim': -1, + 'orthonormal-constraint': '', + 'max-change': 0.75, + 'l2-regularize': '' } + + def check_configs(self): + if self.config['dim'] <= 0: + raise RuntimeError("'dim' must be specified and > 0.") + + def output_name(self, auxiliary_output=None): + assert auxiliary_output is None + return self.name + + def output_dim(self, auxiliary_output=None): + assert auxiliary_output is None + assert self.config['dim'] > 0 + return self.config['dim'] + + def get_full_config(self): + ans = [] + config_lines = self._generate_config() + + for line in config_lines: + for config_name in ['ref', 'final']: + # we do not support user specified matrices in this layer + # so 'ref' and 'final' configs are the same. + ans.append((config_name, line)) + return ans + + def _generate_config(self): + # by 'descriptor_final_string' we mean a string that can appear in + # config-files, i.e. it contains the 'final' names of nodes. 
+ input_desc = self.descriptors['input']['final-string'] + input_dim = self.descriptors['input']['dim'] + output_dim = self.config['dim'] + + opts = '' + for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize']: + value = self.config[opt_name] + if value != '': + opts += ' {0}={1}'.format(opt_name, value) + + configs = [] + line = ('component name={0} type=LinearComponent input-dim={1} output-dim={2} ' + '{3}'.format(self.name, input_dim, output_dim, opts)) + configs.append(line) + line = ('component-node name={0} component={0} input={1}'.format( + self.name, input_desc)) + configs.append(line) + return configs From 96cbdd6974558fbee0e54bbe5474a61b7b52c237 Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Tue, 23 Jan 2018 11:26:28 +0800 Subject: [PATCH 075/184] [egs] Update chain results and add new scripts for fisher_swbd (related to #2136) (#2173) --- .../s5/local/chain/run_blstm_6j.sh | 60 ++-- .../s5/local/chain/run_tdnn_lstm_1a.sh | 6 + .../s5/local/chain/run_tdnn_lstm_1b.sh | 310 ++++++++++++++++++ .../s5/local/chain/run_tdnn_opgru_1a.sh | 62 ++-- .../s5/local/chain/run_tdnn_opgru_1b.sh | 308 +++++++++++++++++ .../s5/local/chain/show_chain_wer.sh | 19 -- .../s5/local/chain/show_chain_wer_rt03.sh | 20 -- 7 files changed, 700 insertions(+), 85 deletions(-) create mode 100644 egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh create mode 100644 egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh delete mode 100644 egs/fisher_swbd/s5/local/chain/show_chain_wer.sh delete mode 100644 egs/fisher_swbd/s5/local/chain/show_chain_wer_rt03.sh diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh index 9810a03ee58..03d362ef552 100644 --- a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh +++ b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh @@ -5,31 +5,37 @@ # The model training procedure is similar to run_blstm_6j.sh under egs/swbd/s5c # ./local/chain/compare_wer_general.sh 
blstm_6j_sp -# System blstm_6j_sp -# WER on eval2000(tg) 12.1 -# WER on eval2000(fg) 11.9 -# WER on rt03(tg) 11.9 -# WER on rt03(fg) 11.6 -# Final train prob -0.059 -# Final valid prob -0.072 -# Final train prob (xent) -0.711 -# Final valid prob (xent) -0.7782 +# System blstm_6j_sp +# WER on eval2000(tg) 12.3 +# WER on eval2000(fg) 12.2 +# WER on rt03(tg) 11.7 +# WER on rt03(fg) 11.5 +# Final train prob -0.061 +# Final valid prob -0.082 +# Final train prob (xent) -0.698 +# Final valid prob (xent) -0.8108 +# num-params=41.3M + +# ./steps/info/chain_dir_info.pl exp/chain/blstm_6j_sp +# exp/chain/blstm_6j_sp: num-iters=2384 nj=3..16 num-params=41.3M dim=40+100->6149 combine=-0.075->-0.074 (over 15) +# xent:train/valid[1587,2383,final]=(-0.754,-0.710,-0.698/-0.828,-0.824,-0.811) +# logprob:train/valid[1587,2383,final]=(-0.070,-0.063,-0.061/-0.082,-0.084,-0.082) # ./local/chain/show_chain_wer.sh blstm_6j_sp -# %WER 15.2 | 2628 21594 | 87.0 8.2 4.8 2.2 15.2 52.0 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys -# %WER 12.1 | 4459 42989 | 89.8 6.8 3.4 1.9 12.1 49.4 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.filt.sys -# %WER 8.5 | 1831 21395 | 92.7 5.1 2.1 1.3 8.5 44.0 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_8_1.0/eval2000_hires.ctm.swbd.filt.sys -# %WER 15.0 | 2628 21594 | 87.2 8.1 4.7 2.2 15.0 51.4 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys -# %WER 11.9 | 4459 42989 | 90.0 6.7 3.3 1.8 11.9 48.6 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.filt.sys -# %WER 8.5 | 1831 21395 | 92.7 5.0 2.3 1.2 8.5 43.7 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 16.0 | 2628 21594 | 86.3 8.7 5.0 2.3 16.0 53.8 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_6_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.3 | 4459 42989 | 89.3 
6.6 4.1 1.6 12.3 49.4 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.3 | 1831 21395 | 92.8 4.8 2.4 1.1 8.3 41.8 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_tg/score_10_1.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 15.7 | 2628 21594 | 86.5 8.5 5.0 2.3 15.7 53.2 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_6_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 12.2 | 4459 42989 | 89.7 6.9 3.4 2.0 12.2 50.1 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_6_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.2 | 1831 21395 | 93.0 4.8 2.2 1.2 8.2 41.6 | exp/chain/blstm_6j_sp/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys # ./local/chain/show_chain_rt03_wer.sh blstm_6j_sp -# %WER 10.1 | 3970 36721 | 91.1 5.3 3.6 1.2 10.1 43.8 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys -# %WER 11.9 | 8420 76157 | 89.6 6.6 3.8 1.5 11.9 45.2 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys -# %WER 13.5 | 4450 39436 | 88.2 7.9 3.9 1.8 13.5 46.4 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.swbd.filt.sys -# %WER 9.7 | 3970 36721 | 91.5 5.1 3.5 1.2 9.7 43.4 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys -# %WER 11.6 | 8420 76157 | 89.9 6.5 3.6 1.5 11.6 44.7 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.filt.sys -# %WER 13.3 | 4450 39436 | 88.5 7.7 3.8 1.8 13.3 45.8 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 9.9 | 3970 36721 | 91.3 5.3 3.4 1.2 9.9 43.6 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.7 | 8420 76157 | 89.6 6.3 4.1 1.3 11.7 44.7 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.3 | 4450 39436 | 88.2 7.5 4.3 1.5 13.3 45.3 | 
exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 9.7 | 3970 36721 | 91.4 5.2 3.4 1.1 9.7 43.1 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.5 | 8420 76157 | 89.8 6.2 4.0 1.3 11.5 44.3 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 13.2 | 4450 39436 | 88.3 7.3 4.3 1.5 13.2 45.1 | exp/chain/blstm_6j_sp/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys set -e @@ -140,14 +146,14 @@ if [ $stage -le 12 ]; then fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults - lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 - lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + fast-lstmp-layer name=blstm1-forward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=blstm1-backward input=lda cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 - lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 - lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + fast-lstmp-layer name=blstm2-forward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=blstm2-backward input=Append(blstm1-forward, blstm1-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 - lstmp-layer name=blstm3-forward 
input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 - lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 + fast-lstmp-layer name=blstm3-forward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 + fast-lstmp-layer name=blstm3-backward input=Append(blstm2-forward, blstm2-backward) cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=3 ## adding the layers for chain branch output-layer name=output input=Append(blstm3-forward, blstm3-backward) output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh index d057470552f..bccd61533d2 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh @@ -7,6 +7,7 @@ # I just apply renorm component in TDNN layers. 
# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp # System tdnn_lstm_1a_sp +# num-params 39.7M # WER on eval2000(tg) 12.3 # [looped:] 12.2 # WER on eval2000(fg) 12.1 @@ -20,6 +21,11 @@ # Final train prob (xent) -0.882 # Final valid prob (xent) -0.9393 +# ./steps/info/chain_dir_info.pl exp/chain/tdnn_lstm_1a_sp +#exp/chain/tdnn_lstm_1a_sp: num-iters=2384 nj=3..16 num-params=39.7M dim=40+100->6149 combine=-0.097->-0.086 +#xent:train/valid[1587,2383,final]=(-0.949,-0.898,-0.882/-0.998,-0.949,-0.939) +#logprob:train/valid[1587,2383,final]=(-0.079,-0.075,-0.074/-0.087,-0.082,-0.084) + # ./show_chain_wer.sh tdnn_lstm_1a_sp # %WER 16.0 | 2628 21594 | 86.3 9.0 4.7 2.3 16.0 54.4 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys # %WER 12.3 | 4459 42989 | 89.4 7.1 3.5 1.7 12.3 49.8 | exp/chain/tdnn_lstm_1a_sp/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh new file mode 100644 index 00000000000..2272f746ab3 --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh @@ -0,0 +1,310 @@ +#!/bin/bash +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 + +# Similar to swbd\s5c\local\chain\tuning\run_tdnn_lstm_1e.sh +# Difference between tdnn_lstm_1a and tdnn_lstm_1b: +# chunk width 150 140,100,160 +# xent_regularize 0.025 0.01 +# minibatch 64 64,32 +# frames-per-iter 1200000 1500000 +# batchnorm in TDNN No Yes +# Dropout in LSTM No Yes + +# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_lstm_1b_sp +# System tdnn_lstm_1a_sp tdnn_lstm_1b_sp +# num-params 39.7M 39.7M +# WER on eval2000(tg) 12.3 12.3 +# [looped:] 12.2 12.3 +# WER on eval2000(fg) 12.1 12.0 +# [looped:] 12.1 12.2 +# WER on rt03(tg) 11.6 11.4 +# [looped:] 11.6 11.6 +# WER on rt03(fg) 11.3 11.1 +# [looped:] 11.3 11.3 +# Final train prob -0.074 -0.087 
+# Final valid prob -0.084 -0.088 +# Final train prob (xent) -0.882 -1.015 +# Final valid prob (xent) -0.9393 -0.9837 + +#./steps/info/chain_dir_info.pl exp/chain/tdnn_lstm_1b_sp +#exp/chain/tdnn_lstm_1b_sp: num-iters=1909 nj=3..16 num-params=39.7M dim=40+100->6149 combine=-0.087->-0.086 (over 5) +#xent:train/valid[1270,1908,final]=(-1.37,-1.02,-1.01/-1.31,-1.00,-0.984) +#logprob:train/valid[1270,1908,final]=(-0.108,-0.088,-0.087/-0.103,-0.091,-0.088) + + +# online results +# Eval2000 +#%WER 15.9 | 2628 21594 | 86.0 8.6 5.4 1.9 15.9 53.5 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 12.3 | 4459 42989 | 89.1 6.8 4.1 1.5 12.3 49.2 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +#%WER 8.6 | 1831 21395 | 92.5 5.2 2.3 1.1 8.6 42.6 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_tg/score_8_1.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 15.7 | 2628 21594 | 86.2 8.5 5.3 1.9 15.7 53.0 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 12.1 | 4459 42989 | 89.3 6.6 4.0 1.5 12.1 48.4 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_fg/score_8_0.0/eval2000_hires.ctm.filt.sys +#%WER 8.5 | 1831 21395 | 92.5 4.9 2.5 1.0 8.5 41.1 | exp/chain/tdnn_lstm_1b_online/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# online results +# RT03 +#%WER 9.4 | 3970 36721 | 91.4 5.0 3.5 0.9 9.4 39.5 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +#%WER 11.6 | 8420 76157 | 89.5 6.4 4.1 1.1 11.6 42.0 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.filt.sys +#%WER 13.5 | 4450 39436 | 87.6 7.3 5.0 1.1 13.5 44.5 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.swbd.filt.sys +#%WER 9.2 | 3970 36721 | 91.6 4.9 3.5 0.9 9.2 39.3 | 
exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +#%WER 11.3 | 8420 76157 | 89.8 6.2 4.0 1.1 11.3 41.6 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +#%WER 13.2 | 4450 39436 | 88.0 7.4 4.6 1.2 13.2 43.6 | exp/chain/tdnn_lstm_1b_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_lstm_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= + +# training options +leftmost_questions_truncate=-1 +frames_per_chunk=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 +dropout_schedule='0,0@0.20,0.2@0.50,0' +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + lstm_opts="decay-time=20 dropout-proportion=0.0" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=fastlstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + fast-lstmp-layer name=fastlstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## 
adding the layers for chain branch + output-layer name=output input=fastlstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=fastlstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=false +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh index 2de8d774451..737e0571b07 100644 --- a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh @@ -8,21 +8,45 @@ # and renorm in its recurrence. Experiments show that the TDNN-NormOPGRU could achieve similar # results than TDNN-LSTMP and BLSTMP in both large or small data sets (80 ~ 2300 Hrs). 
-# ./local/chain/compare_wer_general.sh --looped tdnn_lstm_1a_sp tdnn_opgru_1a_sp -# System tdnn_lstm_1a_sp tdnn_opgru_1a_sp -# WER on eval2000(tg) 12.3 11.6 -# [looped:] 12.2 11.7 -# WER on eval2000(fg) 12.1 11.6 -# [looped:] 12.1 11.6 -# WER on rt03(tg) 11.6 10.9 -# [looped:] 11.6 10.9 -# WER on rt03(fg) 11.3 10.7 -# [looped:] 11.3 10.7 -# Final train prob -0.074 -0.087 -# Final valid prob -0.084 -0.092 -# Final train prob (xent) -0.882 -1.007 -# Final valid prob (xent) -0.9393 -1.0350 - +# ./local/chain/compare_wer_general.sh tdnn_lstm_1a_sp tdnn_lstm_1b_sp tdnn_opgru_1a_sp +# num parameter 39.7M 39.7M 34.9M +# System tdnn_lstm_1a_sp tdnn_lstm_1b_sp tdnn_opgru_1a_sp +# WER on eval2000(tg) 12.3 12.3 11.7 +# [looped:] 12.2 12.3 11.6 +# WER on eval2000(fg) 12.1 12.0 11.7 +# [looped:] 12.1 12.2 11.6 +# WER on rt03(tg) 11.6 11.4 11.0 +# [looped:] 11.6 11.6 11.0 +# WER on rt03(fg) 11.3 11.1 10.7 +# [looped:] 11.3 11.3 10.8 +# Final train prob -0.074 -0.087 -0.085 +# Final valid prob -0.084 -0.088 -0.093 +# Final train prob (xent) -0.882 -1.015 -0.972 +# Final valid prob (xent) -0.9393 -0.9837 -1.0275 + +#./steps/info/chain_dir_info.pl exp/chain/tdnn_opgru_1a_sp +#exp/chain/tdnn_opgru_1a_sp: num-iters=2384 nj=3..16 num-params=34.9M dim=40+100->6149 combine=-0.096->-0.095 (over 8) +#xent:train/valid[1587,2383,final]=(-1.46,-0.960,-0.972/-1.49,-1.02,-1.03) +#logprob:train/valid[1587,2383,final]=(-0.114,-0.086,-0.085/-0.114,-0.094,-0.093) + +# online results +# Eval2000 +# %WER 14.7 | 2628 21594 | 87.3 8.5 4.2 2.0 14.7 50.8 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 11.7 | 4459 42989 | 89.9 7.0 3.1 1.7 11.7 48.1 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.3 | 1831 21395 | 92.7 4.9 2.4 1.0 8.3 42.2 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +# %WER 14.7 | 2628 21594 
| 87.4 8.5 4.1 2.1 14.7 50.5 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +# %WER 11.6 | 4459 42989 | 90.1 6.9 3.0 1.7 11.6 47.6 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.filt.sys +# %WER 8.1 | 1831 21395 | 92.9 4.8 2.3 1.1 8.1 41.8 | exp/chain/tdnn_opgru_1a_sp_online/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# online results +# RT03 +# %WER 8.9 | 3970 36721 | 92.1 5.3 2.5 1.1 8.9 37.3 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 11.0 | 8420 76157 | 90.1 6.1 3.8 1.1 11.0 41.0 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_9_0.0/rt03_hires.ctm.filt.sys +# %WER 13.0 | 4450 39436 | 88.3 7.7 4.0 1.3 13.0 43.1 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys +# %WER 8.6 | 3970 36721 | 92.4 4.9 2.8 1.0 8.6 37.2 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +# %WER 10.8 | 8420 76157 | 90.4 6.2 3.4 1.2 10.8 40.0 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +# %WER 12.8 | 4450 39436 | 88.6 7.5 4.0 1.4 12.8 42.5 | exp/chain/tdnn_opgru_1a_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + set -e @@ -125,7 +149,7 @@ if [ $stage -le 12 ]; then num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - lstm_opts="dropout-per-frame=true dropout-proportion=0.0 " + gru_opts="dropout-per-frame=true dropout-proportion=0.0 " mkdir -p $dir/configs cat < $dir/configs/network.xconfig @@ -143,13 +167,13 @@ if [ $stage -le 12 ]; then relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults - norm-opgru-layer name=opgru1 
cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 - norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 - norm-opgru-layer name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + norm-opgru-layer name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts ## adding the layers for chain branch output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh new file mode 100644 index 00000000000..762db86a8cf --- /dev/null +++ b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh @@ -0,0 +1,308 @@ +#!/bin/bash +# Copyright 2017 University of Chinese Academy of Sciences (UCAS) Gaofeng Cheng +# Apache 2.0 + +# Similar to tdnn_lstm_1e (from egs/swbd/s5c). 
+# Difference between tdnn_opgru_1a and tdnn_opgru_1b: +# chunk width 150 140,100,160 +# xent_regularize 0.025 0.01 +# minibatch 64 64,32 +# frames-per-iter 1200000 1500000 + +# ./local/chain/compare_wer_general.sh tdnn_lstm_1a_sp tdnn_lstm_1b_sp tdnn_opgru_1a_sp tdnn_opgru_1b_sp +# num parameter 39.7M 39.7M 34.9M 34.9M +# System tdnn_lstm_1a_sp tdnn_lstm_1b_sp tdnn_opgru_1a_sp tdnn_opgru_1b_sp +# WER on eval2000(tg) 12.3 12.3 11.7 12.2 +# [looped:] 12.2 12.3 11.6 12.1 +# WER on eval2000(fg) 12.1 12.0 11.7 12.0 +# [looped:] 12.1 12.2 11.6 11.9 +# WER on rt03(tg) 11.6 11.4 11.0 11.3 +# [looped:] 11.6 11.6 11.0 11.3 +# WER on rt03(fg) 11.3 11.1 10.7 11.1 +# [looped:] 11.3 11.3 10.8 11.0 +# Final train prob -0.074 -0.087 -0.085 -0.097 +# Final valid prob -0.084 -0.088 -0.093 -0.093 +# Final train prob (xent) -0.882 -1.015 -0.972 -1.121 +# Final valid prob (xent) -0.9393 -0.9837 -1.0275 -1.0703 + + +#./steps/info/chain_dir_info.pl exp/chain/tdnn_opgru_1b_sp +# exp/chain/tdnn_opgru_1b_sp: num-iters=1807 nj=3..16 num-params=34.9M dim=40+100->6149 combine=-0.102->-0.101 (over 5) +# xent:train/valid[1202,1806,final]=(-1.70,-1.11,-1.12/-1.63,-1.06,-1.07) +# logprob:train/valid[1202,1806,final]=(-0.131,-0.098,-0.097/-0.123,-0.094,-0.093) + +# online results +# Eval2000 +#%WER 15.7 | 2628 21594 | 86.2 8.5 5.3 2.0 15.7 53.2 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_tg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 12.2 | 4459 42989 | 89.3 6.7 4.0 1.5 12.2 48.9 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_tg/score_8_0.0/eval2000_hires.ctm.filt.sys +#%WER 8.5 | 1831 21395 | 92.6 5.0 2.4 1.0 8.5 41.7 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_tg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys +#%WER 15.6 | 2628 21594 | 86.4 8.3 5.3 2.0 15.6 52.5 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.callhm.filt.sys +#%WER 12.1 | 4459 42989 | 89.5 6.8 3.6 1.6 12.1 47.9 | 
exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_fg/score_7_0.0/eval2000_hires.ctm.filt.sys +#%WER 8.4 | 1831 21395 | 92.7 4.9 2.4 1.1 8.4 41.3 | exp/chain/tdnn_opgru_1b_sp_online/decode_eval2000_fsh_sw1_fg/score_10_0.0/eval2000_hires.ctm.swbd.filt.sys + +# online results +# RT03 +#%WER 9.1 | 3970 36721 | 91.8 5.3 2.9 0.9 9.1 37.7 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_tg/score_7_1.0/rt03_hires.ctm.fsh.filt.sys +#%WER 11.4 | 8420 76157 | 89.7 6.8 3.5 1.2 11.4 40.6 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_tg/score_7_0.0/rt03_hires.ctm.filt.sys +#%WER 13.4 | 4450 39436 | 87.8 7.8 4.4 1.2 13.4 43.6 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_tg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys +#%WER 8.9 | 3970 36721 | 92.0 5.0 3.0 0.9 8.9 37.7 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.fsh.filt.sys +#%WER 11.1 | 8420 76157 | 90.0 6.3 3.7 1.1 11.1 40.4 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.filt.sys +#%WER 13.2 | 4450 39436 | 88.1 7.5 4.4 1.3 13.2 42.9 | exp/chain/tdnn_opgru_1b_sp_online/decode_rt03_fsh_sw1_fg/score_8_0.0/rt03_hires.ctm.swbd.filt.sys + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_opgru_1b # Note: _sp will get added to this if $speed_perturb == true. +decode_iter= +decode_dir_affix= +dropout_schedule='0,0@0.20,0.2@0.50,0' + +# training options +leftmost_questions_truncate=-1 +frames_per_chunk=140,100,160 +chunk_left_context=40 +chunk_right_context=0 +xent_regularize=0.01 +self_repair_scale=0.00001 +label_delay=5 +# decode options +extra_left_context=50 +extra_right_context=0 +frames_per_chunk_primary=$(echo $frames_per_chunk | cut -d, -f1) + +remove_egs=false +common_egs_dir= + +affix= +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 11000 data/$build_tree_train_set $lang $build_tree_ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + gru_opts="dropout-per-frame=true dropout-proportion=0.0 " + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-2,-1,0,1,2, ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + norm-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 + norm-opgru-layer name=opgru2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 + norm-opgru-layer 
name=opgru3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $gru_opts + + ## adding the layers for chain branch + output-layer name=output input=opgru3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=opgru3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 + +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$decode_cmd" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.max-param-change 2.0 \ + --trainer.num-epochs 4 \ + --trainer.optimization.shrink-value 0.99 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.momentum 0.0 \ + --trainer.deriv-truncate-margin 8 \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_chunk \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --egs.dir "$common_egs_dir" \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri5a_lats_nodup$suffix \ + --dir $dir || exit 1; +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 data/lang_fsh_sw1_tg $dir $dir/graph_fsh_sw1_tg +fi + +decode_suff=fsh_sw1_tg +graph_dir=$dir/graph_fsh_sw1_tg +if [ $stage -le 15 ]; then + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + [ -z $extra_right_context ] && extra_right_context=$chunk_right_context; + if [ ! -z $decode_iter ]; then + iter_opts=" --iter $decode_iter " + fi + for decode_set in rt03 eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 50 --cmd "$decode_cmd" $iter_opts \ + --extra-left-context $extra_left_context \ + --extra-right-context $extra_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk_primary" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) & + done +fi + +test_online_decoding=true +lang=data/lang_fsh_sw1_tg +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in rt03 eval2000; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj 50 --cmd "$decode_cmd" $iter_opts \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_fsh_sw1_{tg,fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_dir_affix:+_$decode_dir_affix}_fsh_sw1_{tg,fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in online decoding" + exit 1 + fi +fi + +exit 0; diff --git a/egs/fisher_swbd/s5/local/chain/show_chain_wer.sh b/egs/fisher_swbd/s5/local/chain/show_chain_wer.sh deleted file mode 100644 index ce693c8ad56..00000000000 --- a/egs/fisher_swbd/s5/local/chain/show_chain_wer.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_tg/score*/eval2000_hires.ctm.callhm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_tg/score*/eval2000_hires.ctm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_tg/score*/eval2000_hires.ctm.swbd.filt.sys | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_fg/score*/eval2000_hires.ctm.callhm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_fg/score*/eval2000_hires.ctm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_eval2000_fsh_sw1_fg/score*/eval2000_hires.ctm.swbd.filt.sys | utils/best_wer.sh -done diff --git a/egs/fisher_swbd/s5/local/chain/show_chain_wer_rt03.sh b/egs/fisher_swbd/s5/local/chain/show_chain_wer_rt03.sh deleted file mode 100644 index 6aca067a84c..00000000000 --- 
a/egs/fisher_swbd/s5/local/chain/show_chain_wer_rt03.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_tg/score*/rt03_hires.ctm.fsh.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_tg/score*/rt03_hires.ctm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_tg/score*/rt03_hires.ctm.swbd.filt.sys | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_fg/score*/rt03_hires.ctm.fsh.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_fg/score*/rt03_hires.ctm.filt.sys | grep -v swbd | utils/best_wer.sh -done -for l in $*; do - grep Sum exp/chain/${1}/decode_rt03_fsh_sw1_fg/score*/rt03_hires.ctm.swbd.filt.sys | utils/best_wer.sh -done From 8596bbf5488d2339908f6eb73ffa2d7654711ab4 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jan 2018 14:14:39 -0500 Subject: [PATCH 076/184] [egs] removing broken link in babel multilang setup (#2177) --- egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh | 1 - 1 file changed, 1 deletion(-) delete mode 120000 egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh diff --git a/egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh b/egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh deleted file mode 120000 index 5065f95a98c..00000000000 --- a/egs/babel_multilang/s5/local/datasets/unsupervised_uem.sh +++ /dev/null @@ -1 +0,0 @@ -../../../../babel/s5d/local/datasets/unsupervised_uem.sh \ No newline at end of file From 476cb3f6b37c4146057ea5b1f916469fdd2f2273 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Tue, 23 Jan 2018 14:37:20 -0500 Subject: [PATCH 077/184] [egs] fixes for the IAM example (#2176) --- egs/iam/v1/local/check_tools.sh | 45 ++++++++++++++++++++++++++++++++ egs/iam/v1/local/prepare_data.sh | 1 + 
egs/iam/v1/local/prepare_dict.sh | 2 +- egs/iam/v1/run.sh | 9 +++++-- 4 files changed, 54 insertions(+), 3 deletions(-) create mode 100755 egs/iam/v1/local/check_tools.sh diff --git a/egs/iam/v1/local/check_tools.sh b/egs/iam/v1/local/check_tools.sh new file mode 100755 index 00000000000..aa4fe70fa64 --- /dev/null +++ b/egs/iam/v1/local/check_tools.sh @@ -0,0 +1,45 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 2>/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 + + diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh index 1350c5841df..e751d5ff71a 100755 --- a/egs/iam/v1/local/prepare_data.sh +++ b/egs/iam/v1/local/prepare_data.sh @@ -35,6 +35,7 @@ if [[ ! 
-f $download_dir/lines.tgz && -z $username ]]; then echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" echo "... and then call this script again with --username --password " echo "" + exit 1 fi lines=data/local/lines diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh index 77a46df384f..0c3bb325023 100755 --- a/egs/iam/v1/local/prepare_dict.sh +++ b/egs/iam/v1/local/prepare_dict.sh @@ -37,7 +37,7 @@ while(<>){ @A = split; }' | sort > $dir/lexicon.txt -sed -i "s/#//" $dir/nonsilence_phones.txt +sed -i '' "s/#//" $dir/nonsilence_phones.txt echo ' SIL' >> $dir/lexicon.txt echo ' SIL' >> $dir/lexicon.txt diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh index c8ebb9ae649..d5f66ca4110 100755 --- a/egs/iam/v1/run.sh +++ b/egs/iam/v1/run.sh @@ -7,7 +7,8 @@ set -e stage=0 nj=20 - +username= +password= # iam_database points to the database path on the JHU grid. If you have not # already downloaded the database you can set it to a local directory # like "data/download" and follow the instructions @@ -20,9 +21,13 @@ iam_database=/export/corpora5/handwriting_ocr/IAM . ./utils/parse_options.sh # e.g. this parses the above options # if supplied. + +./local/check_tools.sh + if [ $stage -le 0 ]; then echo "$0: Preparing data..." - local/prepare_data.sh --download-dir "$iam_database" + local/prepare_data.sh --download-dir "$iam_database" \ + --username "$username" --password "$password" fi mkdir -p data/{train,test}/data From 49b1562edba0fb470a61a8b09dea48e6a6cc0fd1 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 23 Jan 2018 16:57:04 -0500 Subject: [PATCH 078/184] [egs] Add slightly fixed example. 
--- .../s5c/local/chain/tuning/run_tdnn_7m19h.sh | 14 +- .../s5c/local/chain/tuning/run_tdnn_7m23h.sh | 519 ++++++++++++++++++ .../local/chain/tuning/run_tdnn_lstm_1m.sh | 5 +- 3 files changed, 530 insertions(+), 8 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23h.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh index b509517da68..9ce9a790e2f 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m19h.sh @@ -3,13 +3,13 @@ # 7m19h is as 7m19e but with an extra bypass connection. A bit better. # local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp -# System tdnn7m19e_sp tdnn7m19h_sp -# WER on train_dev(tg) 12.75 12.65 -# WER on train_dev(fg) 11.77 11.57 -# WER on eval2000(tg) 15.5 15.3 -# WER on eval2000(fg) 14.0 13.7 -# WER on rt03(tg) 18.9 18.8 -# WER on rt03(fg) 16.4 16.4 +# System tdnn7m19e_sp tdnn7m19h_sp [rerun of 17m19h:] +# WER on train_dev(tg) 12.75 12.65 12.61 +# WER on train_dev(fg) 11.77 11.57 11.72 +# WER on eval2000(tg) 15.5 15.3 15.4 +# WER on eval2000(fg) 14.0 13.7 13.7 +# WER on rt03(tg) 18.9 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 16.3 # Final train prob -0.092 -0.091 # Final valid prob -0.102 -0.102 # Final train prob (xent) -1.094 -1.091 diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23h.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23h.sh new file mode 100755 index 00000000000..7761cb1c24e --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23h.sh @@ -0,0 +1,519 @@ +#!/bin/bash + +# 7m23h is as 7m23b2 but with a small bugfix, removing a stray 'bottleneck-dim=192'. +# Seems slightly better. The comparison below includes our old TDNN+LSTM result +# with dropout, to show that we're doing better than that now. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# System tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# WER on train_dev(tg) 12.33 12.38 12.28 +# WER on train_dev(fg) 11.42 11.44 11.21 +# WER on eval2000(tg) 15.2 15.1 15.0 +# WER on eval2000(fg) 13.8 13.6 13.5 +# WER on rt03(tg) 18.6 18.4 18.5 +# WER on rt03(fg) 16.3 16.1 16.1 +# Final train prob -0.082 -0.084 -0.083 +# Final valid prob -0.099 -0.098 -0.097 +# Final train prob (xent) -0.959 -1.049 -1.036 +# Final valid prob (xent) -1.0305 -1.0661 -1.0629 +# Num-parameters 39558436 23120164 23513380 +# +# 7m23b2 is as 7m23b but fixing an issue at the last layers. +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages. Interestingly, objf is not better than 23, but +# WER is slightly better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# System tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# WER on train_dev(tg) 12.55 12.23 12.38 +# WER on train_dev(fg) 11.52 11.29 11.44 +# WER on eval2000(tg) 15.2 15.2 15.1 +# WER on eval2000(fg) 13.6 13.7 13.6 +# WER on rt03(tg) 18.6 18.7 18.4 +# WER on rt03(fg) 16.2 16.3 16.1 +# Final train prob -0.089 -0.083 -0.084 +# Final valid prob -0.101 -0.097 -0.098 +# Final train prob (xent) -1.080 -1.025 -1.049 +# Final valid prob (xent) -1.0990 -1.0548 -1.0661 +# Num-parameters 21055012 23120164 23120164 + + +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. +# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. 
+# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The improvement is pretty small but I've seen similar improvements on other
+# setups with this architecture so I tend to believe it.
+
+
+# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp
+# System tdnn_7k_sp tdnn_7m_sp
+# WER on train_dev(tg) 13.83 13.65
+# WER on train_dev(fg) 12.74 12.54
+# WER on eval2000(tg) 16.9 16.8
+# WER on eval2000(fg) 15.2 15.1
+# Final train prob -0.085 -0.084
+# Final valid prob -0.107 -0.103
+# Final train prob (xent) -1.267 -1.215
+# Final valid prob (xent) -1.3107 -1.2735
+
+# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp
+# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103)
+
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-10
+get_egs_stage=-10
+speed_perturb=true
+affix=7m23h
+suffix=
+$speed_perturb && suffix=_sp
+if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi
+
+dir=exp/chain/tdnn${affix}${suffix}
+decode_iter=
+decode_nj=50
+
+# training options
+frames_per_eg=150,110,100
+remove_egs=false
+common_egs_dir=
+xent_regularize=0.1
+
+test_online_decoding=false # if true, it will run the last decoding stage.
+
+# End configuration section.
+echo "$0 $@" # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+ cat <$lang/topo
+fi
+
+if [ $stage -le 11 ]; then
+ # Build a tree using our new topology. This is the critically different
+ # step compared with other recipes.
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1536 + linear-component name=tdnn1l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn2l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1536 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1536 + linear-component name=tdnn4l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1536 input=Append(tdnn4l, tdnn2l) + linear-component name=tdnn5l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn5l,tdnn3l,tdnn1l) dim=1536 + linear-component name=tdnn7l dim=256 $linear_opts 
input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn7l,tdnn5l,tdnn3l) dim=1536 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1536 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn9l,tdnn7l,tdnn5l) dim=1536 + linear-component name=tdnn11l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=tdnn11l $opts dim=1536 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11l $opts dim=1536 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh index 4b2c93082d9..b50692616c4 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_lstm_1m.sh @@ -11,6 +11,9 @@ #WER on train_dev(fg) 11.59 11.46 11.41 #WER on eval2000(tg) 14.8 14.8 14.9 #WER on eval2000(fg) 13.5 13.5 13.6 +# WER on rt03(tg) 18.6 +# WER on rt03(fg) 16.3 + #Final train prob -0.069 -0.081 #Final valid prob -0.095 -0.100 #Final train prob (xent) -0.913 -0.950 @@ -258,7 +261,7 @@ if $test_online_decoding && [ $stage -le 16 ]; then $lang exp/nnet3/extractor $dir ${dir}_online rm $dir/.error 2>/dev/null || true - for decode_set in train_dev eval2000 rt03; do + for decode_set in train_dev eval2000 $maybe_rt03; do ( # note: we just give it "$decode_set" as it only uses the wav.scp, the # feature type does not matter. 
From 4bb2c5c2f3cc5b45db8db7f4150f4b7c0fa3a483 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 23 Jan 2018 18:28:55 -0500 Subject: [PATCH 079/184] [src] Speed fix to online decoding (thanks: David van Leeuwen) --- src/nnet3/decodable-simple-looped.cc | 19 ++++++----- src/nnet3/nnet-compile-looped.cc | 50 ++++++++++++++++++---------- src/nnet3/nnet-compile-looped.h | 34 ++++++++++++++++--- 3 files changed, 72 insertions(+), 31 deletions(-) diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index df18d605b7d..d4edb440d5a 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -60,9 +60,10 @@ void DecodableNnetSimpleLoopedInfo::Init( KALDI_ASSERT(IsSimpleNnet(*nnet)); has_ivectors = (nnet->InputDim("ivector") > 0); int32 left_context, right_context; + int32 extra_right_context = 0; ComputeSimpleNnetContext(*nnet, &left_context, &right_context); frames_left_context = left_context + opts.extra_left_context_initial; - frames_right_context = right_context; + frames_right_context = right_context + extra_right_context; frames_per_chunk = GetChunkSize(*nnet, opts.frame_subsampling_factor, opts.frames_per_chunk); output_dim = nnet->OutputDim("output"); @@ -73,14 +74,14 @@ void DecodableNnetSimpleLoopedInfo::Init( ModifyNnetIvectorPeriod(ivector_period, nnet); int32 num_sequences = 1; // we're processing one utterance at a time. 
- int32 extra_right_context = 0; - CreateLoopedComputationRequestSimple(*nnet, frames_per_chunk, - opts.frame_subsampling_factor, - ivector_period, - opts.extra_left_context_initial, - extra_right_context, - num_sequences, - &request1, &request2, &request3); + + CreateLoopedComputationRequest(*nnet, frames_per_chunk, + opts.frame_subsampling_factor, + ivector_period, + frames_left_context, + frames_right_context, + num_sequences, + &request1, &request2, &request3); CompileLooped(*nnet, opts.optimize_config, request1, request2, request3, &computation); diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index b0ca42f15ab..fa8a2322e5a 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -150,26 +150,24 @@ static void CreateComputationRequestInternal( } -void CreateLoopedComputationRequestSimple(const Nnet &nnet, - int32 chunk_size, - int32 frame_subsampling_factor, - int32 ivector_period, - int32 extra_left_context_begin, - int32 extra_right_context, - int32 num_sequences, - ComputationRequest *request1, - ComputationRequest *request2, - ComputationRequest *request3) { +void CreateLoopedComputationRequest(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 left_context_begin, + int32 right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { bool has_ivector = (nnet.InputDim("ivector") > 0); - int32 left_context, right_context; - ComputeSimpleNnetContext(nnet, &left_context, &right_context); KALDI_ASSERT(chunk_size % frame_subsampling_factor == 0 && chunk_size % nnet.Modulus() == 0 && chunk_size % ivector_period == 0); - KALDI_ASSERT(extra_left_context_begin >= 0 && extra_right_context >= 0); + KALDI_ASSERT(left_context_begin >= 0 && right_context >= 0); // note, 'end' is one past the last one. 
- int32 chunk1_input_begin_t = - left_context - extra_left_context_begin, - chunk1_input_end_t = chunk_size + right_context + extra_right_context, + int32 chunk1_input_begin_t = - left_context_begin, + chunk1_input_end_t = chunk_size + right_context, chunk2_input_begin_t = chunk1_input_end_t, chunk2_input_end_t = chunk2_input_begin_t + chunk_size, chunk3_input_begin_t = chunk2_input_end_t, @@ -349,10 +347,26 @@ void CompileLooped(const Nnet &nnet, } +void CreateLoopedComputationRequestSimple(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 extra_left_context_begin, + int32 extra_right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { + bool has_ivector = (nnet.InputDim("ivector") > 0); + int32 left_context, right_context; + ComputeSimpleNnetContext(nnet, &left_context, &right_context); - - - + CreateLoopedComputationRequest(nnet, chunk_size, frame_subsampling_factor, + ivector_period, + extra_left_context_begin + left_context, + extra_right_context + right_context, + num_sequences, request1, request2, request3); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-compile-looped.h b/src/nnet3/nnet-compile-looped.h index 2ebb371ecc5..7c1bb655c42 100644 --- a/src/nnet3/nnet-compile-looped.h +++ b/src/nnet3/nnet-compile-looped.h @@ -132,10 +132,17 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, 'nnet' before calling this function; otherwise the neural net will most likely not actually be able to consume the iVector with this frequency. - @param [in] extra_left_context_begin The additional left-context that - should be supplied to the network on top of the minimum - that the network requires. We call this extra_left_context_begin - because this only relates to the start of the utterance (t=0). 
+ @param [in] left_context_begin This should be the left-context of the network
+ plus any additional left-context (provided via the option
+ --extra-left-context-begin) that should be supplied to the
+ network on top of the minimum that the network requires. We call
+ this left_context_begin because this only relates to the
+ start of the utterance (t=0).
+ @param [in] right_context This should be the right-context of the network,
+ plus any additional right-context ("extra-right-context") that
+ should be supplied to the network on top of the minimum that the
+ network requires (currently extra-right-context != 0 is not
+ supported at the command-line level).
 @param [in] num_sequences The number of separate 'n' values to put in
 the computation; normally this will be just 1, but it can be
 increased to allow simultaneous operation on multiple streams of
 input.
@@ -152,6 +159,25 @@ void ModifyNnetIvectorPeriod(int32 ivector_period,
 @param [out] request3 The third of the 3 requests that this function
 generates. It will be the same as request2, except for a time offset.
 */
+void CreateLoopedComputationRequest(const Nnet &nnet,
+ int32 chunk_size,
+ int32 frame_subsampling_factor,
+ int32 ivector_period,
+ int32 left_context_begin,
+ int32 right_context,
+ int32 num_sequences,
+ ComputationRequest *request1,
+ ComputationRequest *request2,
+ ComputationRequest *request3);
+
+
+/**
+ This function is deprecated. It has the same interface as
+ CreateLoopedComputationRequest(), except that the left and right context are
+ specified in a different way (as just the 'extra' part). It is deprecated because
+ this function has to work out the left and right context of the network, which
+ turns out to be quite slow if it's done after you call ModifyNnetIvectorPeriod().
+*/ void CreateLoopedComputationRequestSimple(const Nnet &nnet, int32 chunk_size, int32 frame_subsampling_factor, From b73bb12e41e2843a3fd0c660b15506a3bc985a7d Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Wed, 24 Jan 2018 04:02:03 +0330 Subject: [PATCH 080/184] [egs] Make sure scoring opts are passed to score_cer.sh in UW3 (#2181) --- egs/uw3/v1/local/score.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/egs/uw3/v1/local/score.sh b/egs/uw3/v1/local/score.sh index 9ea4701a833..3a6aeaa08ad 100755 --- a/egs/uw3/v1/local/score.sh +++ b/egs/uw3/v1/local/score.sh @@ -143,7 +143,9 @@ if [ $stage -le 1 ]; then fi fi -steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 $data $lang_or_graph $dir +steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \ + --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \ + $data $lang_or_graph $dir # If we got here, the scoring was successful. # As a small aid to prevent confusion, we remove all wer_{?,??} files; From 47de1452869f9128c6fa34bb3f1d06c503e4ef2d Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Tue, 23 Jan 2018 17:37:53 -0800 Subject: [PATCH 081/184] [scripts] Fix typos in scripts (#2182) --- egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh | 4 ++-- egs/wsj/s5/steps/nnet2/get_ivector_id.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh b/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh index dcfa6cf59b8..7c5d3a3254d 100755 --- a/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh +++ b/egs/wsj/s5/steps/nnet2/check_ivectors_compatible.sh @@ -7,8 +7,8 @@ #echo >&2 "$0 $@" # Print the command line for logging if [ $# != 2 ] ; then - echo >$2 "Usage: $0 " - echo >$2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem" + echo >&2 "Usage: $0 " + echo >&2 " e.g.: $0 exp/nnet3/extractor exp/nnet3/ivectors_dev10h.pem" fi dir_a=$1 diff --git a/egs/wsj/s5/steps/nnet2/get_ivector_id.sh 
b/egs/wsj/s5/steps/nnet2/get_ivector_id.sh index 7adcfac11c7..1ec60d1f514 100755 --- a/egs/wsj/s5/steps/nnet2/get_ivector_id.sh +++ b/egs/wsj/s5/steps/nnet2/get_ivector_id.sh @@ -16,8 +16,8 @@ if [ -f path.sh ]; then . ./path.sh; fi if [ $# != 1 ]; then - echo >$2 "Usage: $0 " - echo >$2 " e.g.: $0 exp/nnet3/extractor" + echo >&2 "Usage: $0 " + echo >&2 " e.g.: $0 exp/nnet3/extractor" exit 1 fi From 2e105fc455246540e53f64be19060d594abfc83f Mon Sep 17 00:00:00 2001 From: Gaofeng Cheng <770579626@qq.com> Date: Wed, 24 Jan 2018 10:08:35 +0800 Subject: [PATCH 082/184] [scripts] Fix to get_num_frames.sh for large datasets, RE truncation in awk (#2174) --- egs/wsj/s5/utils/data/get_num_frames.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/utils/data/get_num_frames.sh b/egs/wsj/s5/utils/data/get_num_frames.sh index f3589b2eb06..996468631fa 100755 --- a/egs/wsj/s5/utils/data/get_num_frames.sh +++ b/egs/wsj/s5/utils/data/get_num_frames.sh @@ -22,4 +22,4 @@ fi frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 -awk -v s=$frame_shift '{n += $2} END{printf("%d\n", int(n / s))}' <$data/utt2dur +awk -v s=$frame_shift '{n += $2} END{printf("%.0f\n", (n / s))}' <$data/utt2dur From 7cf434ce1267cfed3101ee723aad7f9e7c42aebb Mon Sep 17 00:00:00 2001 From: Szu-JuiChen <31828751+Szu-JuiChen@users.noreply.github.com> Date: Tue, 23 Jan 2018 21:41:53 -0500 Subject: [PATCH 083/184] [scripts] Fix python3 compatibility bug (#2184) --- egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py b/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py index 7b91b905c3a..56b9f69b3c9 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py +++ b/egs/wsj/s5/steps/diagnostic/analyze_lattice_depth_stats.py @@ -68,7 +68,7 @@ phone_depth_counts = dict() # note: -1 is for all phones put in one bucket. 
-for p in [ -1 ] + phone_int2text.keys(): +for p in [ -1 ] + list(phone_int2text.keys()): phone_depth_counts[p] = defaultdict(int) total_frames = 0 From ed84a510745cb00630138b8e1f19538d29536728 Mon Sep 17 00:00:00 2001 From: "Jan \"yenda\" Trmal" Date: Wed, 24 Jan 2018 13:32:40 -0500 Subject: [PATCH 084/184] [scripts] Prevent crash when input_model is None (#2188) --- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index b62f5510e3c..6896da67f73 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -218,7 +218,7 @@ def process_args(args): if (not os.path.exists(args.dir) or (not os.path.exists(args.dir+"/configs") and - not os.path.exists(args.input_model))): + (args.input_model is None or not os.path.exists(args.input_model)))): raise Exception("This script expects {0} to exist. Also either " "--trainer.input-model option as initial 'raw' model " "(used as 0.raw in the script) should be supplied or " From 6fed4c7dd0b64f82611001793e383ab657eb3a65 Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Wed, 24 Jan 2018 10:33:14 -0800 Subject: [PATCH 085/184] [src] Remove CuDevice destructor (avoid cuda-memcheck warnings) (#2185) --- src/cudamatrix/cu-device.cc | 8 -------- src/cudamatrix/cu-device.h | 1 - 2 files changed, 9 deletions(-) diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index ba0db7df08d..9b0976b05ad 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -547,14 +547,6 @@ CuDevice::CuDevice() : multi_threaded_(false) { } -CuDevice::~CuDevice() { - if (Enabled()) { - cublasDestroy(handle_); - cusparseDestroy(cusparse_handle_); - cudaDeviceReset(); - } -} - // The instance of the static singleton CuDevice CuDevice::global_device_; } diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index c355549648b..99105355a8f 100644 
--- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -47,7 +47,6 @@ class CuTimer; class CuDevice { // Singleton object (there should only be one instantiated per program) public: - ~CuDevice(); static inline CuDevice& Instantiate() { return global_device_; } inline cublasHandle_t GetHandle() { return handle_; } From 7ee7893f9d1ece40029d2e8ec1bc91a9708c66b4 Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Wed, 24 Jan 2018 14:45:44 -0500 Subject: [PATCH 086/184] [src] Fix nnet3 back-compatibility issue (thx: @satmass) --- src/nnet3/nnet-simple-component.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 36fb0db520d..91906ac1ddf 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -361,6 +361,8 @@ void NormalizeComponent::Read(std::istream &is, bool binary) { if (token == "") { ReadBasicType(is, binary, &add_log_stddev_); ReadToken(is, binary, &token); + } else { + add_log_stddev_ = false; } if (token == "") { // back-compatibility code. 
From f861b00ca0087e7f265e5ce80cbe199ebf44d046 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 25 Jan 2018 12:42:33 -0500 Subject: [PATCH 087/184] [src] Speed fix to online decoding (thanks: David van Leeuwen) (#2180) --- src/nnet3/decodable-simple-looped.cc | 19 ++++++----- src/nnet3/nnet-compile-looped.cc | 50 ++++++++++++++++++---------- src/nnet3/nnet-compile-looped.h | 34 ++++++++++++++++--- src/nnet3bin/nnet3-am-copy.cc | 13 +++++++- 4 files changed, 84 insertions(+), 32 deletions(-) diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index df18d605b7d..d4edb440d5a 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -60,9 +60,10 @@ void DecodableNnetSimpleLoopedInfo::Init( KALDI_ASSERT(IsSimpleNnet(*nnet)); has_ivectors = (nnet->InputDim("ivector") > 0); int32 left_context, right_context; + int32 extra_right_context = 0; ComputeSimpleNnetContext(*nnet, &left_context, &right_context); frames_left_context = left_context + opts.extra_left_context_initial; - frames_right_context = right_context; + frames_right_context = right_context + extra_right_context; frames_per_chunk = GetChunkSize(*nnet, opts.frame_subsampling_factor, opts.frames_per_chunk); output_dim = nnet->OutputDim("output"); @@ -73,14 +74,14 @@ void DecodableNnetSimpleLoopedInfo::Init( ModifyNnetIvectorPeriod(ivector_period, nnet); int32 num_sequences = 1; // we're processing one utterance at a time. 
- int32 extra_right_context = 0; - CreateLoopedComputationRequestSimple(*nnet, frames_per_chunk, - opts.frame_subsampling_factor, - ivector_period, - opts.extra_left_context_initial, - extra_right_context, - num_sequences, - &request1, &request2, &request3); + + CreateLoopedComputationRequest(*nnet, frames_per_chunk, + opts.frame_subsampling_factor, + ivector_period, + frames_left_context, + frames_right_context, + num_sequences, + &request1, &request2, &request3); CompileLooped(*nnet, opts.optimize_config, request1, request2, request3, &computation); diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index b0ca42f15ab..fa8a2322e5a 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -150,26 +150,24 @@ static void CreateComputationRequestInternal( } -void CreateLoopedComputationRequestSimple(const Nnet &nnet, - int32 chunk_size, - int32 frame_subsampling_factor, - int32 ivector_period, - int32 extra_left_context_begin, - int32 extra_right_context, - int32 num_sequences, - ComputationRequest *request1, - ComputationRequest *request2, - ComputationRequest *request3) { +void CreateLoopedComputationRequest(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 left_context_begin, + int32 right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { bool has_ivector = (nnet.InputDim("ivector") > 0); - int32 left_context, right_context; - ComputeSimpleNnetContext(nnet, &left_context, &right_context); KALDI_ASSERT(chunk_size % frame_subsampling_factor == 0 && chunk_size % nnet.Modulus() == 0 && chunk_size % ivector_period == 0); - KALDI_ASSERT(extra_left_context_begin >= 0 && extra_right_context >= 0); + KALDI_ASSERT(left_context_begin >= 0 && right_context >= 0); // note, 'end' is one past the last one. 
- int32 chunk1_input_begin_t = - left_context - extra_left_context_begin, - chunk1_input_end_t = chunk_size + right_context + extra_right_context, + int32 chunk1_input_begin_t = - left_context_begin, + chunk1_input_end_t = chunk_size + right_context, chunk2_input_begin_t = chunk1_input_end_t, chunk2_input_end_t = chunk2_input_begin_t + chunk_size, chunk3_input_begin_t = chunk2_input_end_t, @@ -349,10 +347,26 @@ void CompileLooped(const Nnet &nnet, } +void CreateLoopedComputationRequestSimple(const Nnet &nnet, + int32 chunk_size, + int32 frame_subsampling_factor, + int32 ivector_period, + int32 extra_left_context_begin, + int32 extra_right_context, + int32 num_sequences, + ComputationRequest *request1, + ComputationRequest *request2, + ComputationRequest *request3) { + bool has_ivector = (nnet.InputDim("ivector") > 0); + int32 left_context, right_context; + ComputeSimpleNnetContext(nnet, &left_context, &right_context); - - - + CreateLoopedComputationRequest(nnet, chunk_size, frame_subsampling_factor, + ivector_period, + extra_left_context_begin + left_context, + extra_right_context + right_context, + num_sequences, request1, request2, request3); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-compile-looped.h b/src/nnet3/nnet-compile-looped.h index 2ebb371ecc5..7c1bb655c42 100644 --- a/src/nnet3/nnet-compile-looped.h +++ b/src/nnet3/nnet-compile-looped.h @@ -132,10 +132,17 @@ void ModifyNnetIvectorPeriod(int32 ivector_period, 'nnet' before calling this function; otherwise the neural net will most likely not actually be able to consume the iVector with this frequency. - @param [in] extra_left_context_begin The additional left-context that - should be supplied to the network on top of the minimum - that the network requires. We call this extra_left_context_begin - because this only relates to the start of the utterance (t=0). 
+ @param [in] left_context_begin This should be the left-context of the network
+ plus any additional left-context (provided via the option
+ --extra-left-context-begin) that should be supplied to the
+ network on top of the minimum that the network requires. We call
+ this left_context_begin because this only relates to the
+ start of the utterance (t=0).
+ @param [in] right_context This should be the right-context of the network,
+ plus any additional right-context ("extra-right-context") that
+ should be supplied to the network on top of the minimum that the
+ network requires (currently extra-right-context != 0 is not
+ supported at the command-line level).
 @param [in] num_sequences The number of separate 'n' values to put in
 the computation; normally this will be just 1, but it can be
 increased to allow simultaneous operation on multiple streams of
 input.
@@ -152,6 +159,25 @@ void ModifyNnetIvectorPeriod(int32 ivector_period,
 @param [out] request3 The third of the 3 requests that this function
 generates. It will be the same as request2, except for a time offset.
 */
+void CreateLoopedComputationRequest(const Nnet &nnet,
+ int32 chunk_size,
+ int32 frame_subsampling_factor,
+ int32 ivector_period,
+ int32 left_context_begin,
+ int32 right_context,
+ int32 num_sequences,
+ ComputationRequest *request1,
+ ComputationRequest *request2,
+ ComputationRequest *request3);
+
+
+/**
+ This function is deprecated. It has the same interface as
+ CreateLoopedComputationRequest(), except that the left and right context are
+ specified in a different way (as just the 'extra' part). It is deprecated because
+ this function has to work out the left and right context of the network, which
+ turns out to be quite slow if it's done after you call ModifyNnetIvectorPeriod().
+*/ void CreateLoopedComputationRequestSimple(const Nnet &nnet, int32 chunk_size, int32 frame_subsampling_factor, diff --git a/src/nnet3bin/nnet3-am-copy.cc b/src/nnet3bin/nnet3-am-copy.cc index 5f697356dbf..2230ae77c00 100644 --- a/src/nnet3bin/nnet3-am-copy.cc +++ b/src/nnet3bin/nnet3-am-copy.cc @@ -50,6 +50,7 @@ int main(int argc, char *argv[]) { std::string set_raw_nnet = ""; bool convert_repeated_to_block = false; BaseFloat scale = 1.0; + bool prepare_for_test = false; std::string nnet_config, edits_config, edits_str; ParseOptions po(usage); @@ -81,7 +82,11 @@ int main(int argc, char *argv[]) { " are set to this value."); po.Register("scale", &scale, "The parameter matrices are scaled" " by the specified value."); - + po.Register("prepare-for-test", &prepare_for_test, + "If true, prepares the model for test time (may reduce model size " + "slightly. Involves setting test mode in dropout and batch-norm " + "components, and calling CollapseModel() which may remove some " + "components."); po.Read(argc, argv); @@ -135,6 +140,12 @@ int main(int argc, char *argv[]) { if (scale != 1.0) ScaleNnet(scale, &(am_nnet.GetNnet())); + if (prepare_for_test) { + SetBatchnormTestMode(true, &am_nnet.GetNnet()); + SetDropoutTestMode(true, &am_nnet.GetNnet()); + CollapseModel(CollapseModelConfig(), &am_nnet.GetNnet()); + } + if (raw) { WriteKaldiObject(am_nnet.GetNnet(), nnet_wxfilename, binary_write); KALDI_LOG << "Copied neural net from " << nnet_rxfilename From 7c8e1a3f68bc5840c1dbe0ebcb5c6670221bce28 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 25 Jan 2018 14:08:23 -0500 Subject: [PATCH 088/184] [src] Some drafts of the compression code --- src/cudamatrix/cu-compressed-matrix.h | 143 ++++++++++++ src/nnet3/nnet-analyze.cc | 99 ++++++++- src/nnet3/nnet-analyze.h | 12 +- src/nnet3/nnet-computation.cc | 14 +- src/nnet3/nnet-computation.h | 10 +- src/nnet3/nnet-compute.cc | 33 +++ src/nnet3/nnet-compute.h | 8 + src/nnet3/nnet-optimize-utils.cc | 305 
++++++++++++++++++++++++++ src/nnet3/nnet-optimize-utils.h | 40 ++++ src/nnet3/nnet-optimize.cc | 11 +- src/nnet3/nnet-optimize.h | 13 ++ 11 files changed, 678 insertions(+), 10 deletions(-) create mode 100644 src/cudamatrix/cu-compressed-matrix.h diff --git a/src/cudamatrix/cu-compressed-matrix.h b/src/cudamatrix/cu-compressed-matrix.h new file mode 100644 index 00000000000..557892ae266 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix.h @@ -0,0 +1,143 @@ +// cudamatrix/cu-compressed-matrix.h + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + + +#ifndef KALDI_CUDAMATRIX_CU_COMPRESSED_MATRIX_H_ +#define KALDI_CUDAMATRIX_CU_COMPRESSED_MATRIX_H_ + +#include "cudamatrix/cu-matrix.h" + +namespace kaldi { + +/** + Class CuCompressedMatrixBase is an abstract base class that allows you to + compress a matrix of type CuMatrix. When you instantiate it you + would choose the child-class type (by allocating the appropriate child-class + type via 'new'). + */ +class CuCompressedMatrixBase { + public: + + /// Sets *this to an appropriately compressed copy of 'mat', which + /// includes resizing *this. The details of how this is done will be + /// different in different child classes. 
+ virtual void CopyFromMat(CuMatrixBase &mat) = 0; + + /// Copies the contents of *this to 'mat', which should be + /// correctly sized beforehand. + virtual void CopyToMat(CuMatrixBase *mat) = 0; + + + // The number of rows in *this. + virtual int32 NumRows() = 0; + + // The number of columns in *this. + virtual int32 NumCols() = 0; + + ~CuCompressedMatrixBase() { } +}; + + + +/** + Class CuCompressedMatrix, templated on an integer type (expected to be one + of: int8, uint8, int16, uint16), this provides a way to approximate a + CuMatrix in a more memory-efficient format. It's used in nnet3 to + reduce memory use for large networks. + + It is *not* a CUDA equivalent for class CompressedMatrix (of + ../matrix/compressed-matrix.h). + */ +template +class CuCompressedMatrix: public CuCompressedMatrixBase { + public: + + + /// Constructor which sets 'scale_' according to + /// scale_ = range / std::numeric_limits::max(). + /// + /// range = 0 (only supported for I == int8) is a special case in which only + /// the sign of the input is retained; and when we reconstruct, the output + /// will be -1, 0 or 1. + CuCompressedMatrix(BaseFloat range); + + virtual void CopyFromMat(CuMatrixBase &mat); + + virtual void CopyToMat(CuMatrixBase *mat); + + virtual MatrixIndexT NumRows() { return num_rows_; } + + virtual MatrixIndexT NumCols() { return num_cols_; } + + + ~CuCompressedMatrix(); + + private: + + // The raw data. + I *data_; + + // Scale() affects how the raw data is interpreted as a floating point value. + // When uncompressing to a CuMatrix, we'll do + // f = scale_ * i + // where f is the floating point value we're writing to, and i is the integer + // value. + // + // scale_ = 0 is treated specially; in this case we just take notice of the + // sign of the input, and when uncompressing we do it with a scale such + // that the output becomes -1, 0 and 1. 
+ BaseFloat scale_; + + MatrixIndexT num_rows_; + MatrixIndexT num_cols_; + // stride_ is currently always equal to num_cols_; it was added mainly to + // point the way to possible future extension. + MatrixIndexT stride_; +}; + + + +// This enum value is used to encode the type you want to instantiate +// a CuCompressedMatrix with. It's used in class NnetComputation +// (cast to int32) as one of the arguments of kCompressMatrix. +enum CuCompressedMatrixType { + kCompressedMatrixInt8 = 1, + kCompressedMatrixUint8 = 2, + kCompressedMatrixInt16 = 3, + kCompressedMatrixUint16 = 4 +}; + +/** + This function allocates a new CuCompressedMatrix with type determined + by t, and with the 'range' parameter provided (range must be >= 0, + 0 as a special case). + It will crash at runtime if called when CUDA is not compiled in, or not + enabled. + */ +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range); + + + + + + + +#endif diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 140a6f9140c..a2517989294 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -367,6 +367,14 @@ void ComputeCommandAttributes( vars.RecordAccessForSubmatrix(c.arg2, kReadAccess, &attr); break; } + case kCompressMatrix: { + vars.RecordAccessForSubmatrix(c.arg1, kReadWriteAccess, &attr); + break; + } + case kUncompressMatrix: { + vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr); + break; + } case kAcceptInput: { vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr); break; } @@ -555,6 +563,7 @@ void ComputationChecker::Check() { CheckComputationIndexes(); a_.Init(nnet_, computation_); CheckComputationMatrixAccesses(); + CheckComputationCompression(); CheckComputationUndefined(); CheckComputationDebugInfo(); if (config_.check_rewrite) @@ -679,6 +688,63 @@ void ComputationChecker::CheckComputationMatrixAccesses() const { } } +void ComputationChecker::CheckComputationCompression() const { + int32 num_matrices = 
a_.matrix_accesses.size(); + + // 'middle_command' will be the index of the command that separates + // the forward and backward passes. + int32 middle_command = -1; + for (size_t i = 0; i < computation->commands.size(); i++) { + if (computation->commands[i].command_type == kNoOperationMarker) { + middle_command = static_cast(i); + break; + } + } + for (int32 matrix_index = 1; matrix_index < num_matrices; matrix_index++) { + const MatrixAccesses &accesses = a_.matrix_accesses[matrix_index]; + int32 num_accesses = accesses.accesses.size(); + for (int32 a = 0; a < num_accesses; a++) { + const Access &access = accesses.accesses[a]; + int32 command_index = accesses.command_inex; + const NnetComputation::Command &command = + computation_.commands[command_index]; + if (command.command_type == kUncompressMatrix) { + // check that the previous access to this matrix was a compression + // command. + KALDI_ASSERT( + a > 0 && computation_.commands[ + accesses.accesses[a-1].command_index].command_type == + kCompressMatrix); + + if (command.command_type == kCompressMatrix) { + // check that the next access to this matrix is an uncompression + // command. + int32 next_command_index = accesses.accesses[a+1].command_index; + KALDI_ASSERT(computation_.commands[next_command_index].command_type == + kUncompressMatrix && + command_index < middle_command && + next_command_index > middle_command); + if (command.alpha == 0.0) { + // alpha == 0.0 means we're only retaining the sign; we should + // only do this if this is the output of a ReLU. + // make sure there are only 2 commands after this: the uncompress + // command, a relu backprop command, and a deallocation command. + KALDI_ASSERT(a > 0 && command.arg2 == kCompressedMatrixUint8 && + num_accesses <= a + 4); + // make sure the previous access to that matrix was a ReLU + // propagation. 
+ int32 previous_command_index = accesses.accesses[a-1].command_index; + const NnetComputation::Command &previous_command = + computation_.commands[previous_command_index]; + KALDI_ASSERT(previous_command.command_type == kPropagate && + nnet_.GetComponent(previous_command.arg1).Type() == + "RectifiedLinearComponent"); + } + } + } + } +} + /** This very basic check just makes sure that all indexes in the commands are within range, that dimensions agree with the request, that row/column dimensions @@ -931,6 +997,22 @@ void ComputationChecker::CheckComputationIndexes() const { } break; } + case kCompressMatrix: { + if (c.arg1 < 1 || c.arg1 >= num_submatrices || + !computation_.IsWholeMatrix(c.arg1)) + KALDI_ERR << "submatrix index out of range or invalid"; + if (c.arg2 < static_cast(kCompressedMatrixInt8) || + c.arg2 > static_cast(kCompressedMatrixUint16)) + KALDI_ERR << "Invalid compressed-matrix type."; + if (c.alpha < 0.0 || c.alpha > 1000.0 || + (c.alpha == 0.0 && c.arg1 != kCompressedMatrixInt8)) + KALDI_ERR << "Invalid alpha in kCompressMatrix command."; + } + case kUncompressMatrix: { + if (c.arg1 < 1 || c.arg1 >= num_submatrices || + !computation_.IsWholeMatrix(c.arg1)) + KALDI_ERR << "submatrix index out of range or invalid"; + } case kAcceptInput: case kProvideOutput: { if (c.arg1 < 1 || c.arg1 >= num_submatrices || !computation_.IsWholeMatrix(c.arg1)) @@ -1319,13 +1401,22 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { num_submatrices = computation.submatrices.size(); for (int32 command_index = 0; command_index < num_commands; ++command_index) { const NnetComputation::Command &c = computation.commands[command_index]; - int64 this_num_bytes = -100000000; + int64 this_num_bytes = -100000000, + this_compressed_num_bytes = -10000000; if (c.arg1 >= 0 && c.arg1 < num_submatrices) { // if arg1 could plausibly be a sub-matrix index... 
const NnetComputation::SubMatrixInfo &submat_info = computation.submatrices[c.arg1]; this_num_bytes = static_cast(sizeof(BaseFloat)) * submat_info.num_rows * submat_info.num_cols; + + if (c.arg2 >= static_cast(kCompressedMatrixInt8) && + c.arg2 <= static_cast(kCompressedMatrixUint16)) { + this_compressed_num_bytes = + ((c.arg2 == static_cast(kCompressedMatrixInt8) || + c.arg2 == static_cast(kCompressedMatrixUint8)) ? + 1 : 2) * submat_info.num_rows * submat_info.num_cols; + } } switch (c.command_type) { case kAllocMatrix: @@ -1335,6 +1426,12 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { case kDeallocMatrix: cur_memory_use -= this_num_bytes; break; + case kCompressMatrix: + cur_memory_use += this_compressed_num_bytes - this_num_bytes; + break; + case kUncompressMatrix: + cur_memory_use += this_num_bytes - this_compressed_num_bytes; + break; default: break; } diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index a82cd4cb5b1..2966cf947e4 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -414,15 +414,17 @@ class ComputationChecker { const NnetComputation &computation); void Check(); // call this only once. private: - // various dimension consistency checks and checks on properties. + // Various dimension consistency checks and checks on properties. void CheckComputationIndexes() const; - // checks for a situation where an undefined variable is read. + // Checks for a situation where an undefined variable is read. void CheckComputationUndefined() const; - // checks that all writes are done before reads. details with implementation. + // Checks that all writes are done before reads. details with implementation. void CheckComputationRewrite() const; - // check matrix accesses make sense. + // Check matrix accesses make sense. void CheckComputationMatrixAccesses() const; - // check debug_info has the correct size, if used. + // Some checks related to the kCompressMatrix and kUncompressMatrix commands. 
+ void CheckComputationCompression() const; + // Check debug_info has the correct size, if used. void CheckComputationDebugInfo() const; const CheckComputationOptions &config_; diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 77facbdba79..405faa56ede 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -282,6 +282,10 @@ void NnetComputation::Command::Read(std::istream &is, bool binary) { command_type = kAddToRowsMulti; } else if (command_type_str == "kAddRowRanges") { command_type = kAddRowRanges; + } else if (command_type_str == "kCompressMatrix") { + command_type = kCompressMatrix; + } else if (command_type_str == "kUncompressMatrix") { + command_type = kUncompressMatrix; } else if (command_type_str == "kAcceptInput") { command_type = kAcceptInput; } else if (command_type_str == "kProvideOutput") { @@ -375,6 +379,12 @@ void NnetComputation::Command::Write(std::ostream &os, bool binary) const { case kAddRowRanges: os << "kAddRowRanges\n"; break; + case kCompressMatrix: + os << "kCompressMatrix\n"; + break; + case kUncompressMatrix: + os << "kUncompressMatrix\n"; + break; case kAcceptInput: os << "kAcceptInput\n"; break; @@ -689,7 +699,7 @@ void NnetComputation::Print(std::ostream &os, const Nnet &nnet) const { } void NnetComputation::Read(std::istream &is, bool binary) { - int32 version = 4, // must be in sync with 'version' in Write. + int32 version = 5, // must be in sync with 'version' in Write. version_in = 1; // defaults to 1 if no version specified. ExpectToken(is, binary, ""); @@ -823,7 +833,7 @@ void NnetComputation::Read(std::istream &is, bool binary) { } void NnetComputation::Write(std::ostream &os, bool binary) const { - int32 version = 4; // Must be in sync with version in Read. + int32 version = 5; // Must be in sync with version in Read. 
WriteToken(os, binary, ""); WriteToken(os, binary, ""); WriteBasicType(os, binary, version); diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 4b1386a1f01..01c51e8e822 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -232,6 +232,13 @@ struct ComputationRequest { indexes_ranges[arg3]. We use the "alpha" as if AddRowRanges() accepted that argument, even though it doesn't (we fake it using other calls, if alpha != 1.0). + - kCompressMatrix: Compresses the matrix which should be referred to + by submatrix-index arg1. arg2 is a number that determines the + compression type (it's converted from the enum CuCompressed + MatrixType; 1=int8, 2=uint8, 3=int16, 4=uint16), and alpha + determines the 'range' parameter (c.f. NewCuCompressedMatrix()). + - kUncompressMatrix: Uncompresses the matrix which is referred to + by submatrix-index arg1 (it should previously have been compressed). - kAcceptInput: accepts a matrix of input from the user, which may be either features, or derivatives w.r.t. the output. arg1 is the submatrix index of a whole matrix that the input goes to, and arg2 is the index of the network @@ -263,7 +270,8 @@ enum CommandType { kPropagate, kBackprop, kBackpropNoModelUpdate, kMatrixCopy, kMatrixAdd, kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti, kAddRowsMulti, kAddToRowsMulti, - kAddRowRanges, kAcceptInput, kProvideOutput, + kAddRowRanges, kCompressMatrix, kUncompressMatrix, + kAcceptInput, kProvideOutput, kNoOperation, kNoOperationPermanent, kNoOperationMarker, kNoOperationLabel, kGotoLabel }; diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 23286211301..ad63043d851 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -382,6 +382,39 @@ void NnetComputer::ExecuteCommand() { } break; } + case kCompressMatrix: { + // This does nothing if CUDA is not in use. 
+#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + if (compressed_matrices_.empty()) + compressed_matrices_.resize(matrices_.size(), NULL); + int32 m = computation_.submatrices[c.arg1].matrix_index; + KALDI_ASSERT(compressed_matrices_[m] == NULL && + matrices_[m].NumRows() != 0); + compressed_matrices_[m] = NewCuCompressedMatrix( + static_cast(c.arg2), c.alpha); + compressed_matrices_[m]->CopyFromMat(matrices_[m]); + matrices_[m].Resize(0, 0); + } +#endif + } + case kUncompressMatrix: { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + int32 m = computation_.submatrices[c.arg1].matrix_index; + CuCompressedMatrixBase *compressed_matrix = + compressed_matrices_[m]; + KALDI_ASSERT(compressed_matrix != NULL && + matrices_[m].NumRows() == 0); + matrices_[m].Resize(compressed_matrix->NumRows(), + compressed_matrix->NumCols(), + kUndefined); + compressed_matrix->CopyToMat(&(matrices_[m])); + delete compressed_matrix; + compressed_matrices_[m] = NULL; + } +#endif + } case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: break; diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index 869dd107bf6..19af856bad8 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -163,6 +163,14 @@ class NnetComputer { // NULL). std::vector memos_; + // This is only used when commands kCompressMatrix and kUncompressMatrix are + // invoked. It will be (the first time we compress a matrix) resized to be + // the same size as 'matrices_' (i.e., indexed by matrix index). When we + // compress a matrix m we set compressed_matrices_[m] to a non-NULL value and + // resize matrices_[m] to empty; and when we uncompress it, the reverse + // happens. + std::vector compressed_matrices_; + // executes the command in computation_.commands[program_counter_]. 
void ExecuteCommand(); diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 2a0b2dcd499..0d64165efe4 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -4042,5 +4042,310 @@ void RemoveCommandsForUnusedMatrix(const Analyzer &analyzer, } } + + +// This comparison operator is used in the function InsertCommands() +// to sort a list of these pairs by the .first element. +struct CommandPairComparator { + // operator () should be viewed as a '<' operator that only looks at + // the .first element, treating the .second elements as equal. + bool operator () (const std::pair &p1, + const std::pair &p2) const { + return p1.first < p2.first; + } +}; + +void InsertCommands( + std::vector > *new_commands, + NnetComputation *computation) { + int32 num_new_commands = new_commands->size(), + num_old_commands = computation->commands.size(); + if (num_new_commands == 0) + return; + CommandPairOperator comparison_operator; + // use std::stable_sort so that for entries in 'new_commands' that + // have the same .first value, they stay in the same order they were + // in before sorting. 
+ std::stable_sort(new_commands->begin(), new_commands->end(), + comparison_operator); + + if (RandInt(0, 3) == 0) { // check 'new_commands' + for (int32 i = 0; i + 1 < num_new_commands; i++) { + KALDI_ASSERT((*new_commands)[i].first <= (*new_commands)[i+1].first && + (*new_commands)[i].first >= 0 && + (*new_commands)[i+1].first <= num_old_commands); + } + } + std::vector merged_commands; + merged_commands.reserve(num_old_commands + num_new_commands); + + std::vector >::const_iterator + new_commands_iter = new_commands->begin(), + new_commands_end = new_commands->end(); + + for (int32 old_command_index = 0; old_command_index <= num_old_commands; + old_command_index++) { + while (new_commands_iter != new_commands_end && + new_commands_iter->first <= old_command_index) { + merged_commands.push_back(new_commands_iter->second); + ++new_commands_iter; + } + if (old_command_index < num_old_commands) + merged_commands.push_back(computation->commands[old_command_index]); + } + KALDI_ASSERT(merged_commands.size() == num_old_commands + + num_new_commands); + // copy to 'computation->commands' via shallow swap. + computation->commands.swap(merged_commands); + FixGotoLabel(computation); +} + +/** + This class is used in the function OptimizeMemoryCompression(), + once we determine that there is some potential to do memory compression + for this computation. + */ +class MemoryCompressionOptimizer { + public: + + /** @param [in] nnet The neural net the computation is for. + @param [in] memory_compression_level. The level of compression: + 0 = no compression (the constructor should not be calle with this value). + 1 = compression that doesn't affect the results (but still takes time). + 2 = compression that affects the results only very slightly + 3 = compression that affects the results a little more. + @param [in] middle_command Must be the command-index of the + command of type kNoOperationMarker in 'computation'. + @param [in,out] computation The computation we're optimizing. 
+ */ + MemoryCompressionOptimizer(const Nnet &nnet, + int32 memory_compression_level, + int32 middle_command, + NnetComputation *computation): + nnet_(nnet), memory_compression_level_(memory_compression_level), + middle_command_(middle_command), computation_(computation) { } + + void Optimize(); + private: + + // This function, called from Compress(), figures out whether we can compress + // matrix m, and if so, adds an entry to compress_info_. + void ProcessMatrix(int32 m); + + // This function modifies the commands in '*computation_', taking + // as input the commands in compress_info_. + void ModifyComputation(); + + // While deciding what matrices to compress we will create a list of structs + // of type MatrixCompressInfo. Later we copy-and-modify the commands in the + // computation, putting the compression commands into their appropriate place. + struct MatrixCompressInfo { + // m is the matrix-index of the matrix we're going to compress. + int32 m; + // compression_command_index is the command-index of the command + // *after* which we will place the compression command. Normally + // this will be some type of propagation. + int32 compression_command_index; + // compression_command_index is the command-index of the command + // *before* which we will place the uncompression command. Normally + // this will be some type of backprop. + int32 uncompression_command_index; + // 'compression_type' (e.g. kCompressedMatrixInt8) determines the type + // we compress the BaseFloats to. + CuCompressedMatrixType compression_type; + // 'range' determines range of values that the compressed values can + // be in: for signed types they are in [-range, range], for unsigned + // types, in [0, range]. + // As a special case, range = 0 means that the compression just stores the + // sign (-1, 0 or 1) of the input, and decompresses it to -1, 0 or 1; this + // is useful for ReLUs. 
+ BaseFloat range; + + MatrixCompressInfo(int32 m, int32 forward_command_index, + int32 backward_command_index, + CuCompressedMatrixType compression_type, + BaseFloat range): + m(m), compression_command_index(forward_command_index), + uncompression_command_index(backward_command_index), + compression_type(compression_type), range(range) { } + + }; + std::vector compress_info_; + + const Nnet &nnet_; + int32 memory_compression_level_; + NnetComputation *computation_; + Analyzer analyzer_; +}; + + +void MemoryCompressionOptimizer::ModifyComputation() { + int32 cur_num_commands = computation_->commands.size(); + + // whole_submatrices[m] is the submatrix-index of the submatrix that + // represents the whole of matrix m. + std::vector whole_submatrices; + computation_->GetWholeSubmatrices(&whole_submatrices); + + // 'pairs_to_insert' will be a list of pairs (command-index, command), + // meaning: (command-index just before which to insert this command; command + // to insert). + std::vector > + pairs_to_insert; + pairs_to_insert.reserve(compress_info_.size() * 2); + for (size_t i = 0; i < compress_info_.size(); i++) { + const MatrixCompressInfo &info = compress_info_[i]; + int32 s = whole_submatrices[info.m]; + // below we use compression_command_index + 1 because we want the + // compression to go after the command in 'info.compression_command_index' + // (which might be, for instance, a forward propagation command). + std::pair p1( + info.compression_command_index + 1, + NnetComputation::Command(info.range, kCompressMatrix, + s, static_cast(info.compression_type))); + pairs_to_insert.push_back(p1); + std::pair p2( + info.uncompression_command_index, + NnetComputation::Command(1.0, kUncompressMatrix, s)); + pairs_to_insert.push_back(p2); + } + InsertCommands(&pairs_to_insert, + computation_); +} + + +void MemoryCompressionOptimizer::Optimize() { + analyzer_.Init(nnet_, *computation_); + // note: matrix zero is not really a matrix. 
+ int32 num_matrices = computation_->matrices.size(); + for (int32 m = 1; m < num_matrices; m++) + ProcessMatrix(m); + if (!compress_info_.empty()) + ModifyComputation(); +} + +void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { + // 'accesses' list the commands that access this matrix. + const std::vector &accesses = analyzer_.matrix_accesses[m].accesses; + Access middle_access; + middle_access.command_index = middle_command_; + std::vector::const_iterator iter = std::lower_bound(accesses.begin(), + accesses.end(), + middle_access); + // At this point, 'iter' points to the first access in 'accesses' + // whose command index is >= 'middle_command_' (which separates the forward + // and backward passes), or accesses.end() if this matrix was not + // accessed during the backward pass. + if (iter == accesses.end()) { + return; // There is nothing to do: this matrix was not accessed during the + // backward pass. + } + if (iter == accesses.begin()) { + return; // There is nothing to do: this matrix was not accessed during the + // forward pass. + } + // 'backward_access' is the first access of the matrix in the backward + // pass of the computation, and + // 'forward_access' is the last access of the matrix in the forward pass + // of the computation. + const Access &backward_access = iter[0], + &forward_access = iter[-1]; + KALDI_ASSERT(forward_access.command_index < middle_command_ && + backward_access.command_index > middle_command_); + // 'backward_access_is_last_access' is going to be set to true if + // 'backward_access' is the last command to access the matrix (apart from + // deallocation commands). + bool backward_access_is_last_access = false; + if (accesses.end() - iter == 2) { + // if there is at most 1 command after 'backward_access'... 
+ const Access &next_access = iter[1]; + NnetComputation::Command &next_command = + computation_->commands[next_access.command_index]; + if (next_command.command_type == kDeallocMatrix || + next_command.command_type == kSwapMatrix) + backward_access_is_last_access = true; + } + int32 backward_command_index = backward_access.command_index, + forward_command_index = forward_access.command_index; + NnetComputation::Command + &forward_command = computation_->commands[forward_command_index], + &backward_command = computation_->commands[backward_command_index]; + + if (memory_compression_level_ >= 1 && + backward_access_is_last_access && + forward_access.access_type == kWriteAccess && + backward_access.access_type == kReadAccess && + forward_command.command_type == kPropagate && + backward_command.command_type == kBackprop) { + int32 component_index = backward_access.arg1; + const Component *component = nnet_.GetComponent(component_index); + // this is potentially a candidate for our optimization for ReLU units, + // where we only store the sign. + if (component->Type() == "RectifiedLinearComponent" && + component_index == forward_access.arg1) { + compress_info_.push_back( + MatrixCompressInfo(m, forward_command_index, + backward_command_index, + kCompressedMatrixUint8, 0.0)); + return; + } + } + + // TODO: we can later implement compression for other cases. + // + +} + + + + +void OptimizeMemoryCompression(const Nnet &nnet, + int32 memory_compression_level, + NnetComputation *computation) { + if (memory_compression_level == 0 || computation->commands.empty()) + return; + // don't apply this optimization to looped computations. + if (computation->commands.back().command_type == kGotoLabel) + return; + + // 'middle_command' will be the index of the command of type + // 'kNoOperationMarker' that separates the forward and backward + // passes. 
If it doesn't exist, it means this computation doesn't + include + int32 middle_command = -1; + for (size_t i = 0; i < computation->commands.size(); i++) { + if (computation->commands[i].command_type == kNoOperationMarker) { + if (middle_command < 0) { + middle_command = static_cast(i); + } else { + KALDI_WARN << "Found more than one command of type kNoOperationMarker " + "in non-looped computation."; + // there is more than one command of this type... this wasn't expected. + return; + } + } + } + if (middle_command == -1) { + return; // This computation doesn't have a backprop pass. + } + if (memory_compression_level >= 1) { + int64 bytes_used_initial, bytes_used_final; + if (GetVerboseLevel() >= 2) + bytes_used_initial = GetMaxMemoryUse(*computation); + + MemoryCompressionOptimizer opt(nnet, memory_compression_level, + middle_command, computation); + opt.Optimize(); + + if (GetVerboseLevel() >= 2) { + bytes_used_final = GetMaxMemoryUse(*computation); + KALDI_VLOG(2) << "Memory compression reduced memory use from " + << bytes_used_initial << " to " + << bytes_used_final << " bytes."; + } + } +} + + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 98615e2e146..1ffcc330adf 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -524,6 +524,46 @@ void IdentifyIndexesArgs(std::vector *commands, void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); +/// Inserts commands into the computation at the requested places. 'commands' +/// is a list of pairs (command-index, command) that is expected to be sorted +/// on command-index. For each entry (c, command) in 'commands', 'command' is +/// inserted into 'computation' just *before* the command that (at entry) is in +/// computation->commands[c]. 
If there are multiple pairs with the same index +/// c, they will remain in the same order in which they were present in +/// 'commands'; however, 'commands' does not have to be sorted on 'c'. +/// As a special case, if c == computation->commands.size(), the +/// corresponding commands are inserted at the end of the computation. +/// This function will appropriately renumber the argument of the kGotoLabel +/// command of any 'looped' computation. Command indexes c in commands[*].first +/// must be in the range [0, computation->commands.size()]. +/// This function may modify 'commands' by sorting it. +void InsertCommands( + std::vector > *commands, + NnetComputation *computation); + +/// Performs optimization to reduce memory usage where possible, +/// making use of the kCompressMatrix and kUncompressMatrix commands. +/// Should only be done after most other optimizations, because some +/// optimizations (such as variable-merging) would not work correctly +/// after doing this optimization. This does nothing for looped +/// computations. It's OK, though, to expand a shortcut computation +/// (i.e. call ExpandComputation) after doing this. +/// +/// memory_compression_level determines how aggressive the compression +/// is. Allowed values: +/// 0 = no compression at all +/// 1 = compression that doesn't affect results (e.g. compress +/// ReLU outputs to 1 byte, as just the sign is needed). +/// 2 = compression that may affect the results slightly (e.g. 16-bit +/// compression of the output of NormalizeComponent and the like), +/// but this is not implemented yet, so equivalent to 1. +/// 3 = compression that may affect the results more than just +/// slightly. Not implemented yet, so equivalent to 1. +void OptimizeMemoryCompression(const Nnet &nnet, + int32 memory_compression_level, + NnetComputation *computation); + + /// This function tries to optimize computation 'computation' for an 'looped' /// computation. 
It expects as input a computation with no backprop but with /// multiple 'segments' separated by command kNoOperationLabel, where each diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 0eb5de2c4fc..3dff8c0a4f3 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -554,7 +554,7 @@ void Optimize(const NnetOptimizeOptions &config, // the looped computation optimization has to go before // 'RemoveUnnecessaryAllocation()'. We don't gate this by 'config.optimize' // because it's necessary for looped computation to run. - if (config.optimize_looped_computation){ + if (config.optimize_looped_computation) { OptimizeLoopedComputation(nnet, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); @@ -579,6 +579,15 @@ void Optimize(const NnetOptimizeOptions &config, if (config.optimize_looped_computation) FixGotoLabel(computation); + + if (config.memory_compression_level > 0 && + !config.optimize_looped_computation) { + OptimizeMemoryCompression(nnet, config.memory_compression_level, + computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + if (GetVerboseLevel() >= 3) { CheckComputation(nnet, *computation, false); KALDI_LOG << "After optimization, max memory use (bytes) = " diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 4ffa4de449e..d3ecf01847a 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -32,6 +32,7 @@ namespace nnet3 { // Options class for optimizing a NnetComputation. The main projected use for // this is in debugging the optimization code itself, so that if an error is // detected, we can work out which optimization was responsible for the error. +// See the Register() function below for option-specific documentation. struct NnetOptimizeOptions { bool optimize; // setting this false disallow all optimization. 
bool consolidate_model_update; @@ -49,6 +50,7 @@ struct NnetOptimizeOptions { int32 max_deriv_time; int32 max_deriv_time_relative; bool snip_row_ops; + int32 memory_compression_level; // optimize_looped_computation is a 'hidden config' not available from // the command line; it's set to true to enable the optimization for // looped computation that turns a linear computation into a loop. @@ -71,6 +73,7 @@ struct NnetOptimizeOptions { max_deriv_time(std::numeric_limits::max()), max_deriv_time_relative(std::numeric_limits::max()), snip_row_ops(true), + memory_compression_level(0), optimize_looped_computation(false) { } void Register(OptionsItf *opts) { @@ -123,6 +126,16 @@ struct NnetOptimizeOptions { opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " "disable an optimization that reduces the size of certain " "per-row operations"); + opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " + "disable an optimization that reduces the size of certain " + "per-row operations"); + opts->Register("memory-compression-level", &memory_compression_level, + "This is only relevant to training, not decoding. Set this " + "to 0,1,2,3; higher levels are more aggressive at reducing " + "memory by compressing quantities needed for backprop, " + "potentially at the expense of speed and the accuracy " + "of derivatives. 
0 means no compression at all."); + } void Read(std::istream &is, bool binary); void Write(std::ostream &os, bool binary) const; From 3031e26e6744f8454c268696829285cc3ff6bae1 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Thu, 25 Jan 2018 18:27:28 -0500 Subject: [PATCH 089/184] [src] Optimization to decoders for speed (#2168) --- src/decoder/lattice-faster-decoder.cc | 62 +++++++++++++---- src/decoder/lattice-faster-decoder.h | 11 ++- src/decoder/lattice-faster-online-decoder.cc | 72 ++++++++++++++++---- src/decoder/lattice-faster-online-decoder.h | 13 +++- 4 files changed, 126 insertions(+), 32 deletions(-) diff --git a/src/decoder/lattice-faster-decoder.cc b/src/decoder/lattice-faster-decoder.cc index c5c9aae743c..963430a63f1 100644 --- a/src/decoder/lattice-faster-decoder.cc +++ b/src/decoder/lattice-faster-decoder.cc @@ -3,6 +3,7 @@ // Copyright 2009-2012 Microsoft Corporation Mirko Hannemann // 2013-2014 Johns Hopkins University (Author: Daniel Povey) // 2014 Guoguo Chen +// 2018 Zhehuai Chen // See ../../COPYING for clarification regarding multiple authors // @@ -68,7 +69,7 @@ void LatticeFasterDecoder::InitDecoding() { active_toks_[0].toks = start_tok; toks_.Insert(start_state, start_tok); num_toks_++; - ProcessNonemitting(config_.beam); + ProcessNonemittingWrapper(config_.beam); } // Returns true if any kind of traceback is available (not necessarily from @@ -84,8 +85,8 @@ bool LatticeFasterDecoder::Decode(DecodableInterface *decodable) { while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); + BaseFloat cost_cutoff = ProcessEmittingWrapper(decodable); + ProcessNonemittingWrapper(cost_cutoff); } FinalizeDecoding(); @@ -588,8 +589,8 @@ void LatticeFasterDecoder::AdvanceDecoding(DecodableInterface *decodable, if (NumFramesDecoded() % 
config_.prune_interval == 0) { PruneActiveTokens(config_.lattice_beam * config_.prune_scale); } - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); + BaseFloat cost_cutoff = ProcessEmittingWrapper(decodable); + ProcessNonemittingWrapper(cost_cutoff); } } @@ -683,6 +684,7 @@ BaseFloat LatticeFasterDecoder::GetCutoff(Elem *list_head, size_t *tok_count, } } +template BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { KALDI_ASSERT(active_toks_.size() > 0); int32 frame = active_toks_.size() - 1; // frame is the frame-index @@ -707,6 +709,7 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good // dynamic range. + const FstType &fst = dynamic_cast(fst_); // First process the best token to get a hopefully // reasonably tight bound on the next cutoff. The only @@ -715,15 +718,13 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { StateId state = best_elem->key; Token *tok = best_elem->val; cost_offset = - tok->tot_cost; - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { - Arc arc = aiter.Value(); + const Arc &arc = aiter.Value(); if (arc.ilabel != 0) { // propagate.. 
- arc.weight = Times(arc.weight, - Weight(cost_offset - - decodable->LogLikelihood(frame, arc.ilabel))); - BaseFloat new_weight = arc.weight.Value() + tok->tot_cost; + BaseFloat new_weight = arc.weight.Value() + cost_offset - + decodable->LogLikelihood(frame, arc.ilabel) + tok->tot_cost; if (new_weight + adaptive_beam < next_cutoff) next_cutoff = new_weight + adaptive_beam; } @@ -744,7 +745,7 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { StateId state = e->key; Token *tok = e->val; if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); @@ -775,12 +776,31 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { return next_cutoff; } +template BaseFloat LatticeFasterDecoder::ProcessEmitting>( + DecodableInterface *decodable); +template BaseFloat LatticeFasterDecoder::ProcessEmitting>( + DecodableInterface *decodable); +template BaseFloat LatticeFasterDecoder::ProcessEmitting>( + DecodableInterface *decodable); + +BaseFloat LatticeFasterDecoder::ProcessEmittingWrapper(DecodableInterface *decodable) { + if (fst_.Type() == "const") { + return LatticeFasterDecoder::ProcessEmitting>(decodable); + } else if (fst_.Type() == "vector") { + return LatticeFasterDecoder::ProcessEmitting>(decodable); + } else { + return LatticeFasterDecoder::ProcessEmitting>(decodable); + } +} + +template void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { KALDI_ASSERT(!active_toks_.empty()); int32 frame = static_cast(active_toks_.size()) - 2; // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). + const FstType &fst = dynamic_cast(fst_); // Processes nonemitting arcs for one frame. Propagates within toks_. 
// Note-- this queue structure is is not very optimal as @@ -812,7 +832,7 @@ void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { // but since most states are emitting it's not a huge issue. tok->DeleteForwardLinks(); // necessary when re-visiting tok->links = NULL; - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); @@ -837,6 +857,22 @@ void LatticeFasterDecoder::ProcessNonemitting(BaseFloat cutoff) { } // while queue not empty } +template void LatticeFasterDecoder::ProcessNonemitting>( + BaseFloat cutoff); +template void LatticeFasterDecoder::ProcessNonemitting>( + BaseFloat cutoff); +template void LatticeFasterDecoder::ProcessNonemitting>( + BaseFloat cutoff); + +void LatticeFasterDecoder::ProcessNonemittingWrapper(BaseFloat cost_cutoff) { + if (fst_.Type() == "const") { + return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); + } else if (fst_.Type() == "vector") { + return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); + } else { + return LatticeFasterDecoder::ProcessNonemitting>(cost_cutoff); + } +} void LatticeFasterDecoder::DeleteElems(Elem *list) { for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { diff --git a/src/decoder/lattice-faster-decoder.h b/src/decoder/lattice-faster-decoder.h index fd1b2779fe1..56e4af1b95b 100644 --- a/src/decoder/lattice-faster-decoder.h +++ b/src/decoder/lattice-faster-decoder.h @@ -3,6 +3,7 @@ // Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; // 2013-2014 Johns Hopkins University (Author: Daniel Povey) // 2014 Guoguo Chen +// 2018 Zhehuai Chen // See ../../COPYING for clarification regarding multiple authors // @@ -339,12 +340,18 @@ class LatticeFasterDecoder { /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. /// Returns the cost cutoff for subsequent ProcessNonemitting() to use. 
- BaseFloat ProcessEmitting(DecodableInterface *decodable); + /// Templated on FST type for speed; called via ProcessEmittingWrapper(). + template BaseFloat ProcessEmitting(DecodableInterface *decodable); + + BaseFloat ProcessEmittingWrapper(DecodableInterface *decodable); /// Processes nonemitting (epsilon) arcs for one frame. Called after /// ProcessEmitting() on each frame. The cost cutoff is computed by the /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); + /// the templated design is similar to ProcessEmitting() + template void ProcessNonemitting(BaseFloat cost_cutoff); + + void ProcessNonemittingWrapper(BaseFloat cost_cutoff); // HashList defined in ../util/hash-list.h. It actually allows us to maintain // more than one list (e.g. for current and previous frames), but only one of diff --git a/src/decoder/lattice-faster-online-decoder.cc b/src/decoder/lattice-faster-online-decoder.cc index cd7b564b721..5fb2ef25a3d 100644 --- a/src/decoder/lattice-faster-online-decoder.cc +++ b/src/decoder/lattice-faster-online-decoder.cc @@ -4,6 +4,7 @@ // 2013-2014 Johns Hopkins University (Author: Daniel Povey) // 2014 Guoguo Chen // 2014 IMSL, PKU-HKUST (author: Wei Shi) +// 2018 Zhehuai Chen // See ../../COPYING for clarification regarding multiple authors // @@ -68,7 +69,7 @@ void LatticeFasterOnlineDecoder::InitDecoding() { active_toks_[0].toks = start_tok; toks_.Insert(start_state, start_tok); num_toks_++; - ProcessNonemitting(config_.beam); + ProcessNonemittingWrapper(config_.beam); } // Returns true if any kind of traceback is available (not necessarily from @@ -84,8 +85,8 @@ bool LatticeFasterOnlineDecoder::Decode(DecodableInterface *decodable) { while (!decodable->IsLastFrame(NumFramesDecoded() - 1)) { if (NumFramesDecoded() % config_.prune_interval == 0) PruneActiveTokens(config_.lattice_beam * config_.prune_scale); - BaseFloat cost_cutoff = ProcessEmitting(decodable); // Note: the value returned by - 
ProcessNonemitting(cost_cutoff); + BaseFloat cost_cutoff = ProcessEmittingWrapper(decodable); // Note: the value returned by + ProcessNonemittingWrapper(cost_cutoff); } FinalizeDecoding(); @@ -763,8 +764,8 @@ void LatticeFasterOnlineDecoder::AdvanceDecoding(DecodableInterface *decodable, PruneActiveTokens(config_.lattice_beam * config_.prune_scale); } // note: ProcessEmitting() increments NumFramesDecoded(). - BaseFloat cost_cutoff = ProcessEmitting(decodable); - ProcessNonemitting(cost_cutoff); + BaseFloat cost_cutoff = ProcessEmittingWrapper(decodable); + ProcessNonemittingWrapper(cost_cutoff); } } @@ -861,6 +862,7 @@ BaseFloat LatticeFasterOnlineDecoder::GetCutoff(Elem *list_head, size_t *tok_cou } +template BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( DecodableInterface *decodable) { KALDI_ASSERT(active_toks_.size() > 0); @@ -883,6 +885,7 @@ BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( BaseFloat cost_offset = 0.0; // Used to keep probabilities in a good // dynamic range. + const FstType &fst = dynamic_cast(fst_); // First process the best token to get a hopefully // reasonably tight bound on the next cutoff. The only @@ -891,15 +894,13 @@ BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( StateId state = best_elem->key; Token *tok = best_elem->val; cost_offset = - tok->tot_cost; - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { - Arc arc = aiter.Value(); + const Arc &arc = aiter.Value(); if (arc.ilabel != 0) { // propagate.. 
- arc.weight = Times(arc.weight, - Weight(cost_offset - - decodable->LogLikelihood(frame, arc.ilabel))); - BaseFloat new_weight = arc.weight.Value() + tok->tot_cost; + BaseFloat new_weight = arc.weight.Value() + cost_offset - + decodable->LogLikelihood(frame, arc.ilabel) + tok->tot_cost; if (new_weight + adaptive_beam < next_cutoff) next_cutoff = new_weight + adaptive_beam; } @@ -919,8 +920,8 @@ BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( // loop this way because we delete "e" as we go. StateId state = e->key; Token *tok = e->val; - if (tok->tot_cost <= cur_cutoff) { - for (fst::ArcIterator > aiter(fst_, state); + if (tok->tot_cost <= cur_cutoff) { + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); @@ -951,12 +952,35 @@ BaseFloat LatticeFasterOnlineDecoder::ProcessEmitting( return next_cutoff; } +template BaseFloat LatticeFasterOnlineDecoder:: + ProcessEmitting>(DecodableInterface *decodable); +template BaseFloat LatticeFasterOnlineDecoder:: + ProcessEmitting>(DecodableInterface *decodable); +template BaseFloat LatticeFasterOnlineDecoder:: + ProcessEmitting>(DecodableInterface *decodable); + +BaseFloat LatticeFasterOnlineDecoder::ProcessEmittingWrapper( + DecodableInterface *decodable) { + if (fst_.Type() == "const") { + return LatticeFasterOnlineDecoder:: + ProcessEmitting>(decodable); + } else if (fst_.Type() == "vector") { + return LatticeFasterOnlineDecoder:: + ProcessEmitting>(decodable); + } else { + return LatticeFasterOnlineDecoder:: + ProcessEmitting>(decodable); + } +} + +template void LatticeFasterOnlineDecoder::ProcessNonemitting(BaseFloat cutoff) { KALDI_ASSERT(!active_toks_.empty()); int32 frame = static_cast(active_toks_.size()) - 2; // Note: "frame" is the time-index we just processed, or -1 if // we are processing the nonemitting transitions before the // first frame (called from InitDecoding()). + const FstType &fst = dynamic_cast(fst_); // Processes nonemitting arcs for one frame. 
Propagates within toks_. // Note-- this queue structure is is not very optimal as @@ -988,7 +1012,7 @@ void LatticeFasterOnlineDecoder::ProcessNonemitting(BaseFloat cutoff) { // but since most states are emitting it's not a huge issue. tok->DeleteForwardLinks(); // necessary when re-visiting tok->links = NULL; - for (fst::ArcIterator > aiter(fst_, state); + for (fst::ArcIterator aiter(fst, state); !aiter.Done(); aiter.Next()) { const Arc &arc = aiter.Value(); @@ -1013,6 +1037,26 @@ void LatticeFasterOnlineDecoder::ProcessNonemitting(BaseFloat cutoff) { } // while queue not empty } +template void LatticeFasterOnlineDecoder:: + ProcessNonemitting>(BaseFloat cutoff); +template void LatticeFasterOnlineDecoder:: + ProcessNonemitting>(BaseFloat cutoff); +template void LatticeFasterOnlineDecoder:: + ProcessNonemitting>(BaseFloat cutoff); + +void LatticeFasterOnlineDecoder::ProcessNonemittingWrapper( + BaseFloat cost_cutoff) { + if (fst_.Type() == "const") { + return LatticeFasterOnlineDecoder:: + ProcessNonemitting>(cost_cutoff); + } else if (fst_.Type() == "vector") { + return LatticeFasterOnlineDecoder:: + ProcessNonemitting>(cost_cutoff); + } else { + return LatticeFasterOnlineDecoder:: + ProcessNonemitting>(cost_cutoff); + } +} void LatticeFasterOnlineDecoder::DeleteElems(Elem *list) { for (Elem *e = list, *e_tail; e != NULL; e = e_tail) { diff --git a/src/decoder/lattice-faster-online-decoder.h b/src/decoder/lattice-faster-online-decoder.h index b69b5492fb7..6cf0503d891 100644 --- a/src/decoder/lattice-faster-online-decoder.h +++ b/src/decoder/lattice-faster-online-decoder.h @@ -3,6 +3,7 @@ // Copyright 2009-2013 Microsoft Corporation; Mirko Hannemann; // 2013-2014 Johns Hopkins University (Author: Daniel Povey) // 2014 Guoguo Chen +// 2018 Zhehuai Chen // See ../../COPYING for clarification regarding multiple authors // @@ -337,12 +338,18 @@ class LatticeFasterOnlineDecoder { /// Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. 
/// Returns the cost cutoff for subsequent ProcessNonemitting() to use. - BaseFloat ProcessEmitting(DecodableInterface *decodable); + /// Templated on FST type for speed; called via ProcessEmittingWrapper(). + template BaseFloat ProcessEmitting(DecodableInterface *decodable); + + BaseFloat ProcessEmittingWrapper(DecodableInterface *decodable); /// Processes nonemitting (epsilon) arcs for one frame. Called after /// ProcessEmitting() on each frame. The cost cutoff is computed by the /// preceding ProcessEmitting(). - void ProcessNonemitting(BaseFloat cost_cutoff); + /// the templated design is similar to ProcessEmitting() + template void ProcessNonemitting(BaseFloat cost_cutoff); + + void ProcessNonemittingWrapper(BaseFloat cost_cutoff); // HashList defined in ../util/hash-list.h. It actually allows us to maintain // more than one list (e.g. for current and previous frames), but only one of @@ -361,7 +368,7 @@ class LatticeFasterOnlineDecoder { // make it class member to avoid internal new/delete. const fst::Fst &fst_; bool delete_fst_; - std::vector cost_offsets_; // This contains, for each + std::vector cost_offsets_; // This contains, for each // frame, an offset that was added to the acoustic log-likelihoods on that // frame in order to keep everything in a nice dynamic range i.e. close to // zero, to reduce roundoff errors. From ee518b106e4dc334b3c02752a1bda01f66abb120 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 27 Jan 2018 00:54:46 -0500 Subject: [PATCH 090/184] [src] Fixes to compression and matrix-extend code; started work on CUDA stuff. 
--- src/cudamatrix/cu-compressed-matrix.h | 17 +- src/cudamatrix/cu-kernels-ansi.h | 16 ++ src/cudamatrix/cu-kernels.cu | 52 +++++ src/cudamatrix/cu-kernels.h | 24 +++ src/cudamatrix/cu-matrix-lib.h | 1 + src/nnet3/nnet-analyze.cc | 24 +-- src/nnet3/nnet-optimize-utils.cc | 264 ++++++++++++++++++++++++-- src/nnet3/nnet-optimize-utils.h | 10 + src/nnet3/nnet-optimize.cc | 10 +- src/nnet3/nnet-optimize.h | 5 + 10 files changed, 381 insertions(+), 42 deletions(-) diff --git a/src/cudamatrix/cu-compressed-matrix.h b/src/cudamatrix/cu-compressed-matrix.h index 557892ae266..0be1bb391fb 100644 --- a/src/cudamatrix/cu-compressed-matrix.h +++ b/src/cudamatrix/cu-compressed-matrix.h @@ -63,13 +63,15 @@ class CuCompressedMatrixBase { reduce memory use for large networks. It is *not* a CUDA equivalent for class CompressedMatrix (of - ../matrix/compressed-matrix.h). + ../matrix/compressed-matrix.h). Note: this class is only to be used when you + are using a GPU. If you didn't compile for CUDA or you are not using a GPU, + you are not supposed to create an instance of this class, and doing so will + cause a runtime error. */ template class CuCompressedMatrix: public CuCompressedMatrixBase { public: - /// Constructor which sets 'scale_' according to /// scale_ = range / std::numeric_limits::max(). /// @@ -90,6 +92,8 @@ class CuCompressedMatrix: public CuCompressedMatrixBase { ~CuCompressedMatrix(); private: + // If there was data in 'data_', frees it, and sets it to NULL. + void Destroy(); // The raw data. I *data_; @@ -117,12 +121,12 @@ class CuCompressedMatrix: public CuCompressedMatrixBase { // This enum value is used to encode the type you want to instantiate // a CuCompressedMatrix with. It's used in class NnetComputation // (cast to int32) as one of the arguments of kCompressMatrix. 
-enum { +enum CuCompressedMatrixType { kCompressedMatrixInt8 = 1, kCompressedMatrixUint8 = 2, kCompressedMatrixInt16 = 3, kCompressedMatrixUint16 = 4 -} CuCompressedMatrixType; +}; /** This function allocates a new CuCompressedMatrix with type determined @@ -135,9 +139,6 @@ CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, BaseFloat range); - - - - +} // namespace kaldi #endif diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 7d2db9adcc9..8a95ca09537 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -30,6 +30,15 @@ #if HAVE_CUDA == 1 extern "C" { +// "C" version of the BaseFloat typedef-- this saves us having to write +// multiple versions of these kernels. +#if (KALDI_DOUBLEPRECISION != 0) +typedef double BaseFloat; +#else +typedef float BaseFloat; +#endif + + void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat, const MatrixDim d, const double alpha, const double beta); @@ -736,6 +745,13 @@ void cudaF_vec_soft_max(int Gr, int Bl, float* v, int dim); void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); + +void cuda_compress_double_to_int16(dim3 Gr, dim3 Bl, const double *src, + MatrixDim dim, int16_t *dest, + int dest_stride, double inv_scale); +void cuda_compress_int8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride); + } // extern "C" #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 2f8f37224be..f62a07d8917 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3558,6 +3558,47 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m } } + +__global__ +static void _cuda_compress_int8_sign(const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + int i = blockIdx.x * 
blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + if (i < d.cols && j < d.rows) { + BaseFloat f = src[src_index]; + dest[dest_index] = (f > 0.0 ? (unsigned char)1 : (unsigned char)0); + } +} + + +// this version of the function will only be used if BaseFloat is double. +__global__ +static void _cuda_compress_double_to_int16(const double *src, MatrixDim dim, + int16_t *dest, int dest_stride, + double inv_scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + int ok = (i < d.cols && j < d.rows); + if (ok) { + BaseFloat f = src[src_index]; + int i = __double2int_rn(f * inv_scale); + // note: SignedInt will be int8 or (more likely) int16. + int16_t s; + if (i < -32768) s = -32768; + else if (i > 32767) s = 32767; + else s = i; + } + __syncthreads(); + if (ok) { + dest[dest_index] = s; + } +} + + /*********************************************************************** * ANSI-C wrappers of CUDA kernels */ @@ -5220,3 +5261,14 @@ void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, _apply_exp_special<<>>(out, out_dim, in, in_stride); } +void cuda_compress_int8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + _cuda_compress_int8_sign<<>>(src, dim, dest, dest_stride); +} + +void cuda_compress_double_to_int16(dim3 Gr, dim3 Bl, const double *src, + MatrixDim dim, int16_t *dest, + int dest_stride, double inv_scale) { + _cuda_compress_double_to_int16<<>>(src, dim, dest, dest_stride, + inv_scale); +} diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 27ccf760557..dba1a0516a3 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1463,6 +1463,30 @@ inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, 
cudaF_vec_sum(Gr, Bl, v, value, dim, inc); } +// Compresses the matrix in 'src' to 'dest', retaining only zero-one +// information (1 if the value is >0, 0 otherwise) +inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, unsigned char *dest, + int dest_stride) { + cuda_int8_compress_sign(Gr, Bl, src, dim, dest, dest_stride); +} +// this template handles the other types that are not instantiated yet, +// to avoid compilation errors. +template +inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, I *dest, + int dest_stride) { + KALDI_ERR << "Not implemented for this type."; +} + +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, unsigned char *dest, + int dest_stride, BaseFloat inv_scale) { + cuda_int8_compress_sign(Gr, Bl, src, dim, dest, dest_stride, inv_scale); +} + + + } // namespace kaldi #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-matrix-lib.h b/src/cudamatrix/cu-matrix-lib.h index ef21a2945f1..1da7efafc97 100644 --- a/src/cudamatrix/cu-matrix-lib.h +++ b/src/cudamatrix/cu-matrix-lib.h @@ -29,5 +29,6 @@ #include "cudamatrix/cu-sparse-matrix.h" #include "cudamatrix/cu-block-matrix.h" #include "cudamatrix/cu-rand.h" +#include "cudamatrix/cu-compressed-matrix.h" #endif diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index a2517989294..551b50ff6ad 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -694,8 +694,8 @@ void ComputationChecker::CheckComputationCompression() const { // 'middle_command' will be the index of the command that separates // the forward and backward passes. 
int32 middle_command = -1; - for (size_t i = 0; i < computation->commands.size(); i++) { - if (computation->commands[i].command_type == kNoOperationMarker) { + for (size_t i = 0; i < computation_.commands.size(); i++) { + if (computation_.commands[i].command_type == kNoOperationMarker) { middle_command = static_cast(i); break; } @@ -705,7 +705,7 @@ void ComputationChecker::CheckComputationCompression() const { int32 num_accesses = accesses.accesses.size(); for (int32 a = 0; a < num_accesses; a++) { const Access &access = accesses.accesses[a]; - int32 command_index = accesses.command_inex; + int32 command_index = access.command_index; const NnetComputation::Command &command = computation_.commands[command_index]; if (command.command_type == kUncompressMatrix) { @@ -715,7 +715,7 @@ void ComputationChecker::CheckComputationCompression() const { a > 0 && computation_.commands[ accesses.accesses[a-1].command_index].command_type == kCompressMatrix); - + } if (command.command_type == kCompressMatrix) { // check that the next access to this matrix is an uncompression // command. @@ -730,14 +730,14 @@ void ComputationChecker::CheckComputationCompression() const { // make sure there are only 2 commands after this: the uncompress // command, a relu backprop command, and a deallocation command. KALDI_ASSERT(a > 0 && command.arg2 == kCompressedMatrixUint8 && - num_accesses <= a + 4); - // make sure the previous access to that matrix was a ReLU - // propagation. - int32 previous_command_index = accesses.accesses[a-1].command_index; - const NnetComputation::Command &previous_command = - computation_.commands[previous_command_index]; - KALDI_ASSERT(previous_command.command_type == kPropagate && - nnet_.GetComponent(previous_command.arg1).Type() == + num_accesses == a + 4); + // make sure the next access to that matrix, apart from the + // uncompression command, is a ReLU propagation. 
+ int32 next_command_index = accesses.accesses[a+2].command_index; + const NnetComputation::Command &next_command = + computation_.commands[next_command_index]; + KALDI_ASSERT(next_command.command_type == kBackprop && + nnet_.GetComponent(next_command.arg1)->Type() == "RectifiedLinearComponent"); } } diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 0d64165efe4..75521a43658 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -21,7 +21,6 @@ #include "nnet3/nnet-optimize-utils.h" #include "nnet3/nnet-optimize.h" - namespace kaldi { namespace nnet3 { @@ -1016,6 +1015,211 @@ std::pair VariableMergingOptimizer::MayBeMerged( } +// This class is used inside the function +// `void ExtendMatrices(NnetComputation *computation)`; +// see that function's declaration in nnet-optimize-utils.h for +// a summary of what this class does. +class MatrixExtender { + public: + typedef NnetComputation::SubMatrixInfo SubMatrixInfo; + typedef NnetComputation::MatrixInfo MatrixInfo; + + MatrixExtender(NnetComputation *computation); + + void ExtendMatrices(); + + private: + // This function returns true if a copy command from 'src_submatrix' + // to 'dest_submatrix' has the properties we need to be able to + // extend its rows to cover all of the source matrix. + bool CanBeExtended(int32 dest_submatrix_index, + int32 src_submatrix_index); + + // This actually extends the matrices... it's called only if CanBeExtended() + // with the same args returned true. It modifies 'dest_submatrix_index' + // and 'src_submatrix_index'. + void Extend(int32 *dest_submatrix_index, int32 *src_submatrix_index); + + // This function modifies the computation to fix certain problems + // that might have been introduced by Extend()... allocation, deallocation, + // + void FixComputation(); + + // don't extend a destination matrix if it wasn't already + // at least 'min_proportion' (80%) big enough to store the source. 
+ BaseFloat min_proportion_; + + NnetComputation *computation_; + + // Indexed by matrix-index m, orig_num_rows_[m] is the value of + // computation_->matrices[m].num_rows when this class was initialized, + // i.e. before we changed anything. + std::vector orig_num_rows_; + + // Indexed by matrix-index m, this vector contains true if matrix + // m is involved in any AcceptInput() or ProvideOutput() operations. + std::vector is_input_or_output_; +}; + +MatrixExtender::MatrixExtender(NnetComputation *computation): + min_proportion_(0.8), + computation_(computation) { + int32 num_matrices = computation_->matrices.size(); + + { // set up orig_num_rows_. + orig_num_rows_.resize(num_matrices); + // matrix 0 is not a real matrix so skip that index. + for (int32 m = 1; m < num_matrices; m++) + orig_num_rows_[m] = computation_->matrices[m].num_rows; + } + { // set up is_input_or_output_. + is_input_or_output_.resize(num_matrices, false); + std::vector::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + for (; command_iter != command_end; ++command_iter) { + const NnetComputation::Command &command = *command_iter; + // make sure there are no kSwapMatrix commands; they should not be present + // at this stage of optimization. + KALDI_ASSERT(command.command_type != kSwapMatrix); + if (command.command_type == kProvideOutput || + command.command_type == kAcceptInput) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index; + is_input_or_output_[m] = true; + } + } + } +} + + +bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index, + int32 src_submatrix_index) { + const SubMatrixInfo + &src_submatrix = computation_->submatrices[src_submatrix_index], + &dest_submatrix = computation_->submatrices[dest_submatrix_index]; + if (src_submatrix.matrix_index == dest_submatrix.matrix_index) + return false; + + // we can't resize the destination matrix if it's involved in input or output. 
+ if (is_input_or_output_[dest_submatrix.matrix_index]) + return false; + + const MatrixInfo + &src_matrix = computation_->matrices[src_submatrix.matrix_index]; + + int32 dest_matrix_num_rows = orig_num_rows_[dest_submatrix.matrix_index]; + + if (src_submatrix.num_rows < min_proportion_ * src_matrix.num_rows) + return false; + + // The following checks that the source submatrix covers be all of the + // source matrix except a few final rows, and the destination submatrix goes + // to the final row of its matrix. + return (src_submatrix.col_offset == 0 && + src_submatrix.num_cols == src_matrix.num_cols && + src_submatrix.row_offset == 0 && + src_submatrix.num_rows < src_matrix.num_rows && + dest_submatrix.row_offset + dest_submatrix.num_rows == + dest_matrix_num_rows); +} + + +void MatrixExtender::Extend(int32 *dest_submatrix_index, + int32 *src_submatrix_index) { + // copy the SubMatrixInfo to avoid iterator invalidation. + SubMatrixInfo + src_submatrix = computation_->submatrices[*src_submatrix_index], + dest_submatrix = computation_->submatrices[*dest_submatrix_index]; + + MatrixInfo &src_matrix = computation_->matrices[src_submatrix.matrix_index], + &dest_matrix = computation_->matrices[dest_submatrix.matrix_index]; + + int32 new_dest_num_rows = dest_submatrix.row_offset + src_matrix.num_rows; + + // extend the destination matrix so it has enough rows to fit the entire + // source matrix. Note: doing this will break certain invariances in the + // computation, principally with allocation and deallocation commands, which + // we'll later fix up by calling FixComputation(). + if (new_dest_num_rows > dest_matrix.num_rows) { + dest_matrix.num_rows = new_dest_num_rows; + // make sure there's a submatrix index covering the whole of the dest matrix. 
+ computation_->submatrices.push_back( + SubMatrixInfo(dest_submatrix.matrix_index, 0, new_dest_num_rows, + 0, dest_matrix.num_cols)); + } + + // The following 3 statements create a new submatrix that will be + // the destination submatrix; it's the same as the original destination + // submatrix, but with a few extra rows. + *dest_submatrix_index = computation_->submatrices.size(); + dest_submatrix.num_rows = src_matrix.num_rows; + computation_->submatrices.push_back( + SubMatrixInfo(dest_submatrix)); + + // The following 3 statements create a new submatrix that will be + // the source submatrix; it's the same as the original source + // submatrix, but with a few extra rows, and actually will cover + // the entire source matrix. + *src_submatrix_index = computation_->submatrices.size(); + computation_->submatrices.push_back( + SubMatrixInfo(src_submatrix.matrix_index, 0, src_matrix.num_rows, + 0, src_matrix.num_cols)); +} + +void MatrixExtender::ExtendMatrices() { + std::vector::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + bool changed = false; + for (; command_iter != command_end; ++command_iter) { + NnetComputation::Command &command = *command_iter; + if (command.command_type == kMatrixCopy && + command.alpha == 1.0) { + int32 dest_submatrix_index = command.arg1, + src_submatrix_index = command.arg2; + if (CanBeExtended(dest_submatrix_index, src_submatrix_index)) { + Extend(&command.arg1, &command.arg2); + changed = true; + } + } + } + if (changed) + FixComputation(); +} + +void MatrixExtender::FixComputation() { + // make sure that allocation and deallocation commands + // operate on whole matrix. 
+ std::vector::iterator + command_iter = computation_->commands.begin(), + command_end = computation_->commands.end(); + std::vector whole_submatrices; + computation_->GetWholeSubmatrices(&whole_submatrices); + for (; command_iter != command_end; ++command_iter) { + NnetComputation::Command &command = *command_iter; + if (command.command_type == kAllocMatrix || + command.command_type == kDeallocMatrix) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index, + new_s = whole_submatrices[m]; + if (new_s != s) { + KALDI_ASSERT(orig_num_rows_[m] != computation_->matrices[m].num_rows); + command.arg1 = new_s; + } + } + } + RenumberComputation(computation_); +} + + +void ExtendMatrices(NnetComputation *computation) { + MatrixExtender ext(computation); + ext.ExtendMatrices(); +} + + + /** This class is responsible for consolidating the model-update part of backprop commands, for components in (e.g.) recurrent networks that need to have many separate backprop commands, into more efficient single commands @@ -2553,7 +2757,8 @@ static void ConvertNumNValues(int32 n_stride, int32 old_N, int32 new_N, // This class implements the internals of the ExpandComputation() function (used // in shortcut compilation); see comment by the declaration of -// ExpandComputation() in nnet-optimize-utils.h for overview. +// ExpandComputation() in nnet-optimize-utils.h for overview. (It relates to +// shortcut compilation). class ComputationExpander { public: ComputationExpander(const Nnet &nnet, @@ -4062,7 +4267,7 @@ void InsertCommands( num_old_commands = computation->commands.size(); if (num_new_commands == 0) return; - CommandPairOperator comparison_operator; + CommandPairComparator comparison_operator; // use std::stable_sort so that for entries in 'new_commands' that // have the same .first value, they stay in the same order they were // in before sorting. 
@@ -4174,14 +4379,13 @@ class MemoryCompressionOptimizer {
const Nnet &nnet_;
int32 memory_compression_level_;
+ int32 middle_command_;
NnetComputation *computation_;
Analyzer analyzer_;
};
void MemoryCompressionOptimizer::ModifyComputation() {
- int32 cur_num_commands = computation_->commands.size();
-
// whole_submatrices[m] is the submatrix-index of the submatrix that
// represents the whole of matrix m.
std::vector whole_submatrices;
@@ -4221,14 +4425,21 @@ void MemoryCompressionOptimizer::Optimize() {
for (int32 m = 1; m < num_matrices; m++)
ProcessMatrix(m);
if (!compress_info_.empty())
- ModifyComputatin();
+ ModifyComputation();
}
void MemoryCompressionOptimizer::ProcessMatrix(int32 m) {
+ if (analyzer_.matrix_accesses[m].is_output) {
+ return; // We can't do this optimization for matrices that are going to be
+ // output to the user.
+ }
+
// 'accesses' list the commands that access this matrix.
const std::vector &accesses = analyzer_.matrix_accesses[m].accesses;
- Access middle_access;
- middle_access.command_index = middle_command_;
+ // the 'kReadAccess' below is actually a don't-care. This is just
+ // to find the position in 'accesses' that corresponds to command-index
+ // 'middle_command'.
+ Access middle_access(middle_command_, kReadAccess);
std::vector::const_iterator
iter = std::lower_bound(accesses.begin(), accesses.end(),
middle_access);
@@ -4252,12 +4463,14 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) {
&forward_access = iter[-1];
KALDI_ASSERT(forward_access.command_index < middle_command_ &&
backward_access.command_index > middle_command_);
+
// 'backward_access_is_last_access' is going to be set to true if
// 'backward_access' is the last command to access the matrix (apart from
// deallocation commands).
bool backward_access_is_last_access = false;
- if (accesses.end() - backward_access <= 2) {
- // if there is at most 1 command after 'backward_access'...
+ if (accesses.end() - iter <= 2) { + // if there is at most 1 command after 'backward_access' that accesses this + // matrix... const Access &next_access = iter[1]; NnetComputation::Command &next_command = computation_->commands[next_access.command_index]; @@ -4268,21 +4481,17 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { int32 backward_command_index = backward_access.command_index, forward_command_index = forward_access.command_index; NnetComputation::Command - &forward_command = computation_->commands[forward_command_index], &backward_command = computation_->commands[backward_command_index]; if (memory_compression_level_ >= 1 && backward_access_is_last_access && - forward_access.access_type == kWriteAccess && backward_access.access_type == kReadAccess && - forward_command.command_type == kPropagate && backward_command.command_type == kBackprop) { - int32 component_index = backward_access.arg1; + int32 component_index = backward_command.arg1; const Component *component = nnet_.GetComponent(component_index); // this is potentially a candidate for our optimization for ReLU units, - // where we only store the sign. - if (component->Type() == "RectifiedLinearComponent" && - component_index == forward_access.arg1) { + // where we only need to store the sign. + if (component->Type() == "RectifiedLinearComponent") { compress_info_.push_back( MatrixCompressInfo(m, forward_command_index, backward_command_index, @@ -4291,9 +4500,21 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { } } - // TODO: we can later implement compression for other cases. - // + // If memory_compression_level >= 2 (an "intermediate" level of compression), + // then we'll consider compressing quantities using 16 bits in the range + // [-10, 10]. Because of the way this compression works, exact zero will + // still be uncompressed as exact zero, so even if this is the output + // of a ReLU, it's OK. 
(Having a few derivatives zero for ReLU outputs + // that were very close to zero is OK.) + if (memory_compression_level_ >= 2) { + compress_info_.push_back( + MatrixCompressInfo(m, forward_command_index, + backward_command_index, + kCompressedMatrixInt16, 10.0)); + return; + } + // TODO: later maybe implement something for memory compression level = 3. } @@ -4318,10 +4539,11 @@ void OptimizeMemoryCompression(const Nnet &nnet, if (middle_command < 0) { middle_command = static_cast(i); } else { - KALDI_WARN << "Found more than one command of tyep kNoOperationMarker " + KALDI_WARN << "Found more than one command of type kNoOperationMarker " "in non-looped computation."; // there are more than one command of this type... this wasn't expected. - return false; + // return (i.e. do nothing). + return; } } } diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 1ffcc330adf..93f3cdb128f 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -181,6 +181,16 @@ class VariableMergingOptimizer { bool already_called_merge_variables_; }; +/** + This is not really an optimization in itself but it can make things easier + for class VariableMergingOptimizer (usually called by its wrapper + VariableMergingOptimization()). It looks for a case where most of a matrix + (but not its final rows) are copied to some submatrix of another matrix, + where the row-range of that submatrix extends to the last row of the other + matrix; and it extends the other matrix with additional rows so that the + entire source matrix can be copied to the destination. 
+ */ +void ExtendMatrices(NnetComputation *computation); /** diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 3dff8c0a4f3..872624eaa7e 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -515,6 +515,14 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, true); } + if (config.optimize && config.extend_matrices && + !config.optimize_looped_computation) { + ExtendMatrices(computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + + if (config.optimize && (config.remove_assignments || config.backprop_in_place || config.propagate_in_place)) { @@ -582,7 +590,7 @@ void Optimize(const NnetOptimizeOptions &config, if (config.memory_compression_level > 0 && !config.optimize_looped_computation) { - OptimizeMemoryCompression(nnet_, config.memory_compression_level, + OptimizeMemoryCompression(nnet, config.memory_compression_level, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index d3ecf01847a..ba8efce0fe3 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -39,6 +39,7 @@ struct NnetOptimizeOptions { bool propagate_in_place; bool backprop_in_place; bool optimize_row_ops; + bool extend_matrices; bool convert_addition; bool remove_assignments; bool allow_left_merge; @@ -62,6 +63,7 @@ struct NnetOptimizeOptions { propagate_in_place(true), backprop_in_place(true), optimize_row_ops(true), + extend_matrices(true), convert_addition(true), remove_assignments(true), allow_left_merge(true), @@ -87,6 +89,9 @@ struct NnetOptimizeOptions { "disable optimization that allows in-place propagation"); opts->Register("backprop-in-place", &backprop_in_place, "Set to false to " "disable optimization that allows in-place backprop"); + opts->Register("extend-matrices", &extend_matrices, "This optimization " + "can reduce memory requirements for TDNNs 
when applied " + "together with --convert-addition=true"); opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to " "disable certain optimizations that act on operations of " "type *Row*."); From 0269f3634f5aa2f4166b680e86815fee78eaed04 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 27 Jan 2018 15:14:55 -0500 Subject: [PATCH 091/184] [src] Add cu-compressed-matrix.cc --- src/cudamatrix/Makefile | 3 +- src/cudamatrix/cu-compressed-matrix.cc | 112 +++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) create mode 100644 src/cudamatrix/cu-compressed-matrix.cc diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index e6ade23728f..a57685eee06 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -11,7 +11,7 @@ TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ - cu-sparse-matrix.o cu-allocator.o cu-array.o + cu-sparse-matrix.o cu-allocator.o cu-array.o cu-compressed-matrix.o ifeq ($(CUDA), true) OBJFILES += cu-kernels.o endif @@ -33,4 +33,3 @@ endif $(CUDATKDIR)/bin/nvcc -c $< -o $@ $(CUDA_INCLUDE) $(CUDA_FLAGS) $(CUDA_ARCH) -I../ include ../makefiles/default_rules.mk - diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc new file mode 100644 index 00000000000..4ca52817271 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -0,0 +1,112 @@ +// cudamatrix/cu-compressed-matrix.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#if HAVE_CUDA == 1 +#include +#include +#endif + +#include "base/timer.h" +#include "cudamatrix/cu-common.h" +#include "cudamatrix/cu-vector.h" +#include "cudamatrix/cu-device.h" +#include "cudamatrix/cu-kernels.h" +#include "cudamatrix/cu-array.h" +#include "cudamatrix/cu-compressed-matrix.h" + +namespace kaldi { + + +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range) { + if (t == kCompressedMatrixUint8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } +} + +template CuCompressedMatrix::CuCompressedMatrix(BaseFloat range): + data_(NULL), scale_(range / std::numeric_limits::max()), + num_rows_(0), num_cols_(0), stride_(0) { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); +#endif + KALDI_ERR << "You instantiated CuCompressedMatrix while GPU use " + "was not compiled in."; +} + + +template void CuCompressedMatrix::Destroy() { +#if HAVE_CUDA == 1 + if (data_ != NULL) { + CuTimer tim; + CuDevice::Instantiate().Free(data_); + data_ = NULL; + num_rows_ = 0; + num_cols_ = 0; + stride_ = 0; + CuDevice::Instantiate().AccuProfile(__func__, tim); + } +#endif +} + +template void CuCompressedMatrix::CopyFromMat( + CuMatrixBase &mat) { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); + Destroy(); + if (mat.NumRows() == 0) + return; + num_rows_ = mat.NumRows(); + num_cols_ = 
mat.NumCold(); + stride_ = num_cols_; + + CuTimer tim; + data_ = CuDevice::Instantiate().Malloc(sizeof(I) * num_rows_ * num_cols_); + + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + + if (scale_ == 0.0) { // scale == 0 calls a different kernel from the others. + cuda_mat_compress_sign(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_); + } else { + cuda_mat_compress(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_, 1.0 / scale_); + + } + + CU_SAFE_CALL(cudaGetLastError()); + + + + CuDevice::Instantiate().AccuProfile(CuCompressedMatrix::CopyFromMat(malloc), + tim); + + +#endif +} + + +} // namespace kaldi From 5aa698aab24b93b4bae211391e7543d74ab9e1ac Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 27 Jan 2018 19:29:52 -0500 Subject: [PATCH 092/184] [src] Finish and test CuCompressedMatrix code. --- src/cudamatrix/Makefile | 2 +- src/cudamatrix/cu-compressed-matrix-test.cc | 179 ++++++++++++++++++++ src/cudamatrix/cu-compressed-matrix.cc | 116 ++++++++----- src/cudamatrix/cu-compressed-matrix.h | 39 +++-- src/cudamatrix/cu-kernels-ansi.h | 39 ++++- src/cudamatrix/cu-kernels.cu | 157 ++++++++++++++--- src/cudamatrix/cu-kernels.h | 53 +++++- 7 files changed, 495 insertions(+), 90 deletions(-) create mode 100644 src/cudamatrix/cu-compressed-matrix-test.cc diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index a57685eee06..ca831390ea9 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -7,7 +7,7 @@ LDLIBS += $(CUDA_LDLIBS) TESTFILES = cu-vector-test cu-matrix-test cu-math-test cu-test cu-sp-matrix-test cu-packed-matrix-test cu-tp-matrix-test \ cu-block-matrix-test cu-matrix-speed-test cu-vector-speed-test cu-sp-matrix-speed-test cu-array-test \ - cu-sparse-matrix-test cu-device-test cu-rand-speed-test + cu-sparse-matrix-test cu-device-test cu-rand-speed-test cu-compressed-matrix-test OBJFILES = cu-device.o cu-math.o cu-rand.o cu-matrix.o 
cu-packed-matrix.o cu-sp-matrix.o \ cu-vector.o cu-common.o cu-tp-matrix.o cu-block-matrix.o \ diff --git a/src/cudamatrix/cu-compressed-matrix-test.cc b/src/cudamatrix/cu-compressed-matrix-test.cc new file mode 100644 index 00000000000..3cbd7bd5060 --- /dev/null +++ b/src/cudamatrix/cu-compressed-matrix-test.cc @@ -0,0 +1,179 @@ +// cudamatrix/cu-compressed-matrix-test.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include +#include +#include + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "cudamatrix/cu-matrix-lib.h" + +using namespace kaldi; + + +namespace kaldi { + +void CuCompressedMatrixTestSign() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandn(); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(kCompressedMatrixUint8, 0.0); + + // this just stores (M(i, j) > 0 ? 1 : 0). 
+ cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + M.Heaviside(M); + + AssertEqual(M, M2); + delete cm; +} + +void CuCompressedMatrixTestNonnegative() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandUniform(); + + BaseFloat range = 0.5 * RandInt(1, 5); + M.Scale(range); + + CuCompressedMatrixType t = (RandInt(0, 1) == 0 ? + kCompressedMatrixUint8 : + kCompressedMatrixUint16); + + // since the input is in the correct range, truncating or not should make no + // difference. + bool truncate = (RandInt(0, 1) == 0); + + BaseFloat extra_error = 0.0; + if (truncate && (RandInt(0, 1) == 0)) { + // this tests that with truncate == true, adding a small offset, which would + // take us outside the representable range, will not add too much extra + // error. (with truncate == false this would not be true because we wouldn't + // round to the edges of the range, it would wrap around). + extra_error = -0.01 * (RandInt(0, 1) == 0 ? 1.0 : -1.0); + M.Add(extra_error); + } + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(t, range, truncate); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + + M2.AddMat(-1.0, M); + + BaseFloat diff_max = M2.Max(), + diff_min = M2.Min(); + + BaseFloat + headroom = 1.1, + max_expected_error = fabs(extra_error) + headroom * 0.5 * + range / (t == kCompressedMatrixUint8 ? 255 : 65535); + + KALDI_ASSERT(diff_max < max_expected_error && + diff_min > -1.0 * max_expected_error); + + delete cm; +} + +// this is like CuCompressedMatrixTestNonnegative but +// with signed integers, and input in the range [-range, +range]. +void CuCompressedMatrixTestSymmetric() { + int32 num_rows = RandInt(80, 100), + num_cols = RandInt(80, 100); + CuMatrix M(num_rows, num_cols); + M.SetRandUniform(); + M.Scale(2.0); + M.Add(-1.0); + + BaseFloat range = 0.5 * RandInt(1, 5); + M.Scale(range); + + CuCompressedMatrixType t = (RandInt(0, 1) == 0 ? 
+ kCompressedMatrixInt8 : + kCompressedMatrixInt16); + + // since the input is in the correct range, truncating or not should make no + // difference. + bool truncate = (RandInt(0, 1) == 0); + + BaseFloat extra_error = 0.0; + if (truncate && (RandInt(0, 1) == 0)) { + // this tests that with truncate == true, adding a small offset, which would + // take us outside the representable range, will not add too much extra + // error. (with truncate == false this would not be true because we wouldn't + // round to the edges of the range, it would wrap around). + extra_error = -0.01 * (RandInt(0, 1) == 0 ? 1.0 : -1.0); + M.Add(extra_error); + } + + CuCompressedMatrixBase *cm = NewCuCompressedMatrix(t, range, truncate); + + CuMatrix M2(num_rows, num_cols, kUndefined); + + cm->CopyFromMat(M); + cm->CopyToMat(&M2); + + + M2.AddMat(-1.0, M); + + BaseFloat diff_max = M2.Max(), + diff_min = M2.Min(); + + BaseFloat + headroom = 1.1, + max_expected_error = fabs(extra_error) + headroom * 0.5 * + range / (t == kCompressedMatrixInt8 ? 127 : 32767); + + KALDI_ASSERT(diff_max < max_expected_error && + diff_min > -1.0 * max_expected_error); + + delete cm; +} + + + +} // namespace kaldi + + +int main() { + SetVerboseLevel(1); + // we don't run this test if CUDA is not compiled in, since + // you can't instantiate class CuCompressedMatrix in that case. 
+#if HAVE_CUDA == 1 + CuDevice::Instantiate().SelectGpuId("yes"); + for (int32 i = 1; i < 10; i++) { + CuCompressedMatrixTestSign(); + CuCompressedMatrixTestNonnegative(); + CuCompressedMatrixTestSymmetric(); + } + +#endif + return 0; +} diff --git a/src/cudamatrix/cu-compressed-matrix.cc b/src/cudamatrix/cu-compressed-matrix.cc index 4ca52817271..be02921169d 100644 --- a/src/cudamatrix/cu-compressed-matrix.cc +++ b/src/cudamatrix/cu-compressed-matrix.cc @@ -34,79 +34,109 @@ namespace kaldi { -CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, - BaseFloat range) { - if (t == kCompressedMatrixUint8) { - KALDI_ASSERT(range >= 0); - return new CuCompressedMatrix(range); - } else if (t == kCompressedMatrixInt16) { - KALDI_ASSERT(range > 0); - return new CuCompressedMatrix(range); - } -} - -template CuCompressedMatrix::CuCompressedMatrix(BaseFloat range): +template +CuCompressedMatrix::CuCompressedMatrix(BaseFloat range, bool truncate): data_(NULL), scale_(range / std::numeric_limits::max()), - num_rows_(0), num_cols_(0), stride_(0) { + truncate_(truncate), num_rows_(0), num_cols_(0), stride_(0) { #if HAVE_CUDA == 1 KALDI_ASSERT(CuDevice::Instantiate().Enabled()); -#endif +#else KALDI_ERR << "You instantiated CuCompressedMatrix while GPU use " "was not compiled in."; +#endif } - -template void CuCompressedMatrix::Destroy() { +template +void CuCompressedMatrix::Destroy() { #if HAVE_CUDA == 1 if (data_ != NULL) { - CuTimer tim; + // we don't bother timing this because Free() won't normally have to + // access the GPU at all (due to caching). 
CuDevice::Instantiate().Free(data_); data_ = NULL; num_rows_ = 0; num_cols_ = 0; stride_ = 0; - CuDevice::Instantiate().AccuProfile(__func__, tim); } #endif } -template void CuCompressedMatrix::CopyFromMat( - CuMatrixBase &mat) { +template +void CuCompressedMatrix::CopyFromMat( + const CuMatrixBase &mat) { #if HAVE_CUDA == 1 KALDI_ASSERT(CuDevice::Instantiate().Enabled()); - Destroy(); if (mat.NumRows() == 0) return; - num_rows_ = mat.NumRows(); - num_cols_ = mat.NumCold(); - stride_ = num_cols_; - - CuTimer tim; - data_ = CuDevice::Instantiate().Malloc(sizeof(I) * num_rows_ * num_cols_); - - dim3 dimGrid, dimBlock; - GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), - &dimGrid, &dimBlock); - - if (scale_ == 0.0) { // scale == 0 calls a different kernel from the others. - cuda_mat_compress_sign(dimGrid, dimBlock, mat.Data(), mat.Dim(), - data_, stride_); - } else { - cuda_mat_compress(dimGrid, dimBlock, mat.Data(), mat.Dim(), - data_, stride_, 1.0 / scale_); - + if (num_rows_ != mat.NumRows() || num_cols_ != mat.NumCols()) { + Destroy(); + num_rows_ = mat.NumRows(); + num_cols_ = mat.NumCols(); + data_ = static_cast( + CuDevice::Instantiate().Malloc(sizeof(I) * num_rows_ * num_cols_)); + stride_ = num_cols_; } + { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + + if (scale_ == 0.0) { // scale == 0 calls a different kernel from the others. 
+ cuda_mat_compress_sign(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_); + } else { + cuda_mat_compress(dimGrid, dimBlock, mat.Data(), mat.Dim(), + data_, stride_, float(1.0 / scale_), + truncate_); + } CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } +#endif +} - - CuDevice::Instantiate().AccuProfile(CuCompressedMatrix::CopyFromMat(malloc), - tim); +template +void CuCompressedMatrix::CopyToMat(CuMatrixBase *mat) const { +#if HAVE_CUDA == 1 + KALDI_ASSERT(CuDevice::Instantiate().Enabled()); + KALDI_ASSERT(mat->NumRows() == num_rows_ && mat->NumCols() == num_cols_); + { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + BaseFloat scale = (scale_ == 0.0 ? 1.0 : scale_); + cuda_mat_uncompress(dimGrid, dimBlock, mat->Data(), mat->Dim(), + data_, stride_, float(scale)); + } +#endif +} -#endif +CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, + BaseFloat range, + bool truncat) { + if (t == kCompressedMatrixUint8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt8) { + KALDI_ASSERT(range >= 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixUint16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } else if (t == kCompressedMatrixInt16) { + KALDI_ASSERT(range > 0); + return new CuCompressedMatrix(range); + } else { + KALDI_ERR << "Unknown compressed-matrix type"; + return NULL; + } } + } // namespace kaldi diff --git a/src/cudamatrix/cu-compressed-matrix.h b/src/cudamatrix/cu-compressed-matrix.h index 0be1bb391fb..2eafc20c6cc 100644 --- a/src/cudamatrix/cu-compressed-matrix.h +++ b/src/cudamatrix/cu-compressed-matrix.h @@ -38,20 +38,20 @@ class CuCompressedMatrixBase { /// Sets *this to an appropriately compressed copy of 'mat', which /// includes resizing *this. 
The details of how this is done will be /// different in different child classes. - virtual void CopyFromMat(CuMatrixBase &mat) = 0; + virtual void CopyFromMat(const CuMatrixBase &mat) = 0; /// Copies the contents of *this to 'mat', which should be /// correctly sized beforehand. - virtual void CopyToMat(CuMatrixBase *mat) = 0; + virtual void CopyToMat(CuMatrixBase *mat) const = 0; // The number of rows in *this. - virtual int32 NumRows() = 0; + virtual int32 NumRows() const = 0; // The number of columns in *this. - virtual int32 NumCols() = 0; + virtual int32 NumCols() const = 0; - ~CuCompressedMatrixBase() { } + virtual ~CuCompressedMatrixBase() { } }; @@ -78,18 +78,25 @@ class CuCompressedMatrix: public CuCompressedMatrixBase { /// range = 0 (only supported for I == int8) is a special case in which only /// the sign of the input is retained; and when we reconstruct, the output /// will be -1, 0 or 1. - CuCompressedMatrix(BaseFloat range); + /// + /// truncate (only relevant if range != 0) should be true if it's possible + /// that the input could exceed the allowed input range, i.e. [0, range] if I + /// is unsigned, and [-range, range] if I is signed; and it may be false if + /// you know that the input (the matrix given to CopyFromMat) will have + /// elements only in the allowed range. Setting 'truncate' to false + /// allows the compression code to avoid the bounds check. 
+ CuCompressedMatrix(BaseFloat range, bool truncate = true); - virtual void CopyFromMat(CuMatrixBase &mat); + virtual void CopyFromMat(const CuMatrixBase &mat); - virtual void CopyToMat(CuMatrixBase *mat); + virtual void CopyToMat(CuMatrixBase *mat) const; - virtual MatrixIndexT NumRows() { return num_rows_; } + virtual MatrixIndexT NumRows() const { return num_rows_; } - virtual MatrixIndexT NumCols() { return num_cols_; } + virtual MatrixIndexT NumCols() const { return num_cols_; } - ~CuCompressedMatrix(); + virtual ~CuCompressedMatrix() { Destroy(); } private: // If there was data in 'data_', frees it, and sets it to NULL. @@ -109,6 +116,8 @@ class CuCompressedMatrix: public CuCompressedMatrixBase { // that the output becomes -1, 0 and 1. BaseFloat scale_; + bool truncate_; + MatrixIndexT num_rows_; MatrixIndexT num_cols_; // stride_ is currently always equal to num_cols_; it was added mainly to @@ -130,13 +139,15 @@ enum CuCompressedMatrixType { /** This function allocates a new CuCompressedMatrix with type determined - by t, and with the 'range' parameter provided (range must be >= 0, - 0 as a special case). + by t, and with the 'range' and 'truncate' parameters provided to the + constructor of class CuCompressedMatrix. + It will crash at runtime if called when CUDA is not compiled in, or not enabled. 
*/ CuCompressedMatrixBase *NewCuCompressedMatrix(CuCompressedMatrixType t, - BaseFloat range); + BaseFloat range, + bool truncate = true); } // namespace kaldi diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 8a95ca09537..8ab03c7e14e 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -746,11 +746,40 @@ void cudaD_vec_sum(int Gr, int Bl, double* v, double* value, int dim, int inc); void cudaF_vec_sum(int Gr, int Bl, float* v, float* value, int dim, int inc); -void cuda_compress_double_to_int16(dim3 Gr, dim3 Bl, const double *src, - MatrixDim dim, int16_t *dest, - int dest_stride, double inv_scale); -void cuda_compress_int8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, - unsigned char *dest, int dest_stride); +void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); +void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check); + +void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, int dest_stride); + +void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale); +void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale); +void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale); +void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, 
BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale); + + } // extern "C" diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index f62a07d8917..b0468b7fa7c 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3560,44 +3560,102 @@ static void _diff_lstm_nonlinearity(const int cell_dim, const int have_dropout_m __global__ -static void _cuda_compress_int8_sign(const BaseFloat *src, MatrixDim dim, - unsigned char *dest, int dest_stride) { +static void _cuda_compress_uint8_sign(const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int dest_index = i + j * dest_stride, src_index = i + j * dim.stride; - if (i < d.cols && j < d.rows) { + if (i < dim.cols && j < dim.rows) { BaseFloat f = src[src_index]; dest[dest_index] = (f > 0.0 ? (unsigned char)1 : (unsigned char)0); } } -// this version of the function will only be used if BaseFloat is double. +// The following inline templated functions are a workaround for the +// fact that (I believe) std::numeric_limits is not available in CUDA; +// they allow us to access the minimum and maximum elements of certain +// types from templated code. 
+template __device__ static inline int minimum_integer_value(); +template __device__ static inline int maximum_integer_value(); + +template<> __device__ int maximum_integer_value() { return 127; } +template<> __device__ int minimum_integer_value() { return -128; } +template<> __device__ int maximum_integer_value() { return 255; } +template<> __device__ int minimum_integer_value() { return 0; } +template<> __device__ int maximum_integer_value() { return 32767; } +template<> __device__ int minimum_integer_value() { return -32768; } +template<> __device__ int maximum_integer_value() { return 65535; } +template<> __device__ int minimum_integer_value() { return 0; } + + + +template __global__ -static void _cuda_compress_double_to_int16(const double *src, MatrixDim dim, - int16_t *dest, int dest_stride, - double inv_scale) { +static void _cuda_compress_bounds_check(const BaseFloat *src, MatrixDim dim, + I *dest, int dest_stride, float inv_scale) { int i = blockIdx.x * blockDim.x + threadIdx.x; int j = blockIdx.y * blockDim.y + threadIdx.y; int dest_index = i + j * dest_stride, src_index = i + j * dim.stride; - int ok = (i < d.cols && j < d.rows); + const int min_value = minimum_integer_value(), + max_value = maximum_integer_value(); + int16_t compressed_value; + int ok = (i < dim.cols && j < dim.rows); if (ok) { - BaseFloat f = src[src_index]; - int i = __double2int_rn(f * inv_scale); + float f = src[src_index]; + // note: I'm not sure what __float2int_rn does if input is outside of + // integer range, but it doesn't matter much as in the situations where this + // type of compression would make sense, the input should be well inside the + // range of 'int', and if it fails, we've probably already catastrophically + // diverged. + int i = __float2int_rn(f * inv_scale); // note: SignedInt will be int8 or (more likely) int16. 
- int16_t s; - if (i < -32768) s = -32768; - else if (i > 32767) s = 32767; - else s = i; + if (i < min_value) compressed_value = min_value; + else if (i > max_value) compressed_value = max_value; + else compressed_value = i; } __syncthreads(); if (ok) { + dest[dest_index] = compressed_value; + } +} + + +template +__global__ +static void _cuda_compress_no_bounds_check(const BaseFloat *src, MatrixDim dim, + I *dest, int dest_stride, + float inv_scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int dest_index = i + j * dest_stride, + src_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + float f = src[src_index]; + int i = __float2int_rn(f * inv_scale); + I s = i; dest[dest_index] = s; } } +template +__global__ +static void _cuda_uncompress(BaseFloat *dest, MatrixDim dim, + const I *src, int src_stride, + float scale) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + int src_index = i + j * src_stride, + dest_index = i + j * dim.stride; + if (i < dim.cols && j < dim.rows) { + I s = src[src_index]; + dest[dest_index] = float(s * scale); + } +} + + /*********************************************************************** * ANSI-C wrappers of CUDA kernels @@ -5261,14 +5319,69 @@ void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, _apply_exp_special<<>>(out, out_dim, in, in_stride); } -void cuda_compress_int8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, - unsigned char *dest, int dest_stride) { - _cuda_compress_int8_sign<<>>(src, dim, dest, dest_stride); +void cuda_compress_uint8_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, MatrixDim dim, + unsigned char *dest, int dest_stride) { + _cuda_compress_uint8_sign<<>>(src, dim, dest, dest_stride); +} + +void cuda_compress_int16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if 
(bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_uint16(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_int8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } +} +void cuda_compress_uint8(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + if (bounds_check) { + _cuda_compress_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } else { + _cuda_compress_no_bounds_check<<>>(src, dim, dest, dest_stride, inv_scale); + } } -void cuda_compress_double_to_int16(dim3 Gr, dim3 Bl, const double *src, - MatrixDim dim, int16_t *dest, - int dest_stride, double inv_scale) { - _cuda_compress_double_to_int16<<>>(src, dim, dest, dest_stride, - inv_scale); +void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_int8(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_uint16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int src_stride, float scale) { + 
_cuda_uncompress<<>>(dest, dim, src, src_stride, scale); +} +void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale) { + _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index dba1a0516a3..3518e0c71ed 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -1466,9 +1466,9 @@ inline void cuda_vec_sum(int Gr, int Bl, float* v, float* value, int dim, // Compresses the matrix in 'src' to 'dest', retaining only zero-one // information (1 if the value is >0, 0 otherwise) inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, - MatrixDim dim, unsigned char *dest, + MatrixDim dim, uint8 *dest, int dest_stride) { - cuda_int8_compress_sign(Gr, Bl, src, dim, dest, dest_stride); + cuda_compress_uint8_sign(Gr, Bl, src, dim, dest, dest_stride); } // this template handles the other types that are not instantiated yet, // to avoid compilation errors. 
@@ -1480,11 +1480,54 @@ inline void cuda_mat_compress_sign(dim3 Gr, dim3 Bl, const BaseFloat *src, } inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, - MatrixDim dim, unsigned char *dest, - int dest_stride, BaseFloat inv_scale) { - cuda_int8_compress_sign(Gr, Bl, src, dim, dest, dest_stride, inv_scale); + MatrixDim dim, int16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_int16(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint16_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_uint16(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, uint8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_uint8(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); +} +inline void cuda_mat_compress(dim3 Gr, dim3 Bl, const BaseFloat *src, + MatrixDim dim, int8_t *dest, + int dest_stride, float inv_scale, + bool bounds_check) { + cuda_compress_int8(Gr, Bl, src, dim, dest, dest_stride, + inv_scale, bounds_check); } +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int8_t *src, + int src_stride, float scale) { + cuda_uncompress_int8(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint8_t *src, + int src_stride, float scale) { + cuda_uncompress_uint8(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const int16_t *src, + int src_stride, float scale) { + cuda_uncompress_int16(Gr, Bl, dest, dim, src, src_stride, scale); +} +inline void cuda_mat_uncompress(dim3 Gr, dim3 Bl, BaseFloat *dest, + MatrixDim dim, const uint16_t *src, + int 
src_stride, float scale) { + cuda_uncompress_uint16(Gr, Bl, dest, dim, src, src_stride, scale); +} } // namespace kaldi From 5dbfe97fe4a45fd0b4d753974393a38cf39a63f8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 00:44:42 -0500 Subject: [PATCH 093/184] [src] Various bug fixeS --- src/nnet3/nnet-analyze.cc | 41 ++++++++++++- src/nnet3/nnet-analyze.h | 1 + src/nnet3/nnet-common.cc | 30 +++++++--- src/nnet3/nnet-computation.cc | 19 +++++++ src/nnet3/nnet-computation.h | 4 ++ src/nnet3/nnet-compute.cc | 14 ++++- src/nnet3/nnet-compute.h | 2 +- src/nnet3/nnet-optimize-utils.cc | 98 +++++++++++++++++++++++++------- src/nnet3/nnet-optimize.cc | 39 +++++++------ src/nnet3/nnet-optimize.h | 3 - 10 files changed, 199 insertions(+), 52 deletions(-) diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 551b50ff6ad..cd49e22b451 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -238,6 +238,23 @@ std::string ComputationVariables::DescribeVariable(int32 variable) const { return os.str(); } +NnetComputation::SubMatrixInfo ComputationVariables::VariableInfo( + int32 variable) const { + KALDI_ASSERT(variable >= 0 && variable < num_variables_); + int32 matrix_index = variable_to_matrix_[variable], + offset = variable - matrix_to_variable_index_[matrix_index], + num_column_variables = column_split_points_[matrix_index].size() - 1, + column_variable = offset % num_column_variables, + row_variable = offset / num_column_variables; + int32 row_offset = row_split_points_[matrix_index][row_variable], + num_rows = row_split_points_[matrix_index][row_variable+1] - row_offset, + col_offset = column_split_points_[matrix_index][column_variable], + num_cols = column_split_points_[matrix_index][column_variable+1] - + col_offset; + return NnetComputation::SubMatrixInfo(matrix_index, row_offset, num_rows, + col_offset, num_cols); +} + /// given a vector of pairs from computation.indexes_multi_indexes /// containing paris (submatrix-index, 
row-index), this function outputs @@ -622,6 +639,19 @@ void ComputationChecker::CheckComputationUndefined() const { const std::vector &accesses = a_.variable_accesses[v]; if (accesses.empty()) { if (config_.check_unused_variables) { + // Before we throw an error, we want to check that it isn't + // a case that can be produced by the ExtendMatrices() + // optimization, that is actually allowed. This is a case + // when a variable is the last few rows of a matrix, but + // not all columns of those last rows. + NnetComputation::SubMatrixInfo info = a_.variables.VariableInfo(v); + const NnetComputation::MatrixInfo &matrix_info = + computation_.matrices[info.matrix_index]; + if (info.row_offset > 0 && + info.num_rows + info.row_offset == matrix_info.num_rows && + !(info.col_offset == 0 && info.num_cols == matrix_info.num_cols)) { + continue; + } KALDI_ERR << "Variable " << v << " == " << a_.variables.DescribeVariable(v) << " is never used."; } @@ -728,9 +758,10 @@ void ComputationChecker::CheckComputationCompression() const { // alpha == 0.0 means we're only retaining the sign; we should // only do this if this is the output of a ReLU. // make sure there are only 2 commands after this: the uncompress - // command, a relu backprop command, and a deallocation command. + // command, and a relu backprop command. (Any deallocation + // command doesn't show up in the list of 'accesses'). KALDI_ASSERT(a > 0 && command.arg2 == kCompressedMatrixUint8 && - num_accesses == a + 4); + num_accesses == a + 3); // make sure the next access to that matrix, apart from the // uncompression command, is a ReLU propagation. 
int32 next_command_index = accesses.accesses[a+2].command_index; @@ -1004,14 +1035,18 @@ void ComputationChecker::CheckComputationIndexes() const { if (c.arg2 < static_cast(kCompressedMatrixInt8) || c.arg2 > static_cast(kCompressedMatrixUint16)) KALDI_ERR << "Invalid compressed-matrix type."; + if (c.arg3 != 0 && c.arg3 != 1) + KALDI_ERR << "Invalid 'truncate' option for compressing matrix."; if (c.alpha < 0.0 || c.alpha > 1000.0 || - (c.alpha == 0.0 && c.arg1 != kCompressedMatrixInt8)) + (c.alpha == 0.0 && c.arg2 != kCompressedMatrixUint8)) KALDI_ERR << "Invalid alpha in kCompressMatrix command."; + break; } case kUncompressMatrix: { if (c.arg1 < 1 || c.arg1 >= num_submatrices || !computation_.IsWholeMatrix(c.arg1)) KALDI_ERR << "submatrix index out of range or invalid"; + break; } case kAcceptInput: case kProvideOutput: { if (c.arg1 < 1 || c.arg1 >= num_submatrices || diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 2966cf947e4..2e1a9a33c0b 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -160,6 +160,7 @@ class ComputationVariables { // zero indexing): something like "m1" or "m1(0:99,:)" or "m1(0:19,10:49)" std::string DescribeVariable(int32 variable) const; + NnetComputation::SubMatrixInfo VariableInfo(int32 variable) const; private: // sets up split_points_, matrix_to_variable_index_, and num_variables_. // called from constructor. diff --git a/src/nnet3/nnet-common.cc b/src/nnet3/nnet-common.cc index 75350d3d8f6..31ff9819dfa 100644 --- a/src/nnet3/nnet-common.cc +++ b/src/nnet3/nnet-common.cc @@ -440,6 +440,11 @@ void PrintIndexes(std::ostream &os, os << "[ ]"; return; } + // If the string is longer than 'max_string_length' characters, it will + // be summarized with '...' in the middle. + size_t max_string_length = 200; + std::ostringstream os_temp; + // range_starts will be the starts of ranges (with consecutive t values and // the same n value and zero x values) that we compactly print. 
we'll append // "end" to range_starts for convenience.n @@ -457,23 +462,32 @@ void PrintIndexes(std::ostream &os, } range_starts.push_back(cur_start); range_starts.push_back(end); - os << "["; + os_temp << "["; int32 num_ranges = range_starts.size() - 1; for (int32 r = 0; r < num_ranges; r++) { int32 range_start = range_starts[r], range_end = range_starts[r+1]; KALDI_ASSERT(range_end > range_start); - os << "(" << indexes[range_start].n << ","; + os_temp << "(" << indexes[range_start].n << ","; if (range_end == range_start + 1) - os << indexes[range_start].t; + os_temp << indexes[range_start].t; else - os << indexes[range_start].t << ":" << indexes[range_end - 1].t; + os_temp << indexes[range_start].t << ":" << indexes[range_end - 1].t; if (indexes[range_start].x != 0) - os << "," << indexes[range_start].x; - os << ")"; + os_temp << "," << indexes[range_start].x; + os_temp << ")"; if (r + 1 < num_ranges) - os << ", "; + os_temp << ", "; + } + os_temp << "]"; + + std::string str = os_temp.str(); + if (str.size() <= max_string_length) { + os << str; + } else { + size_t len = str.size(); + os << str.substr(0, max_string_length / 2) << " ... " + << str.substr(len - max_string_length / 2); } - os << "]"; } void PrintCindexes(std::ostream &ostream, diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 405faa56ede..40f9f2146a0 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -621,6 +621,25 @@ static void PrintCommand(std::ostream &os, os << "])\n"; break; } + case kCompressMatrix: { + BaseFloat range = c.alpha; + std::string truncate = (c.arg3 != 0 ? 
"true" : "false"); + std::string compressed_matrix_type; + if (c.arg2 == kCompressedMatrixInt8) { compressed_matrix_type = "int8"; } + else if (c.arg2 == kCompressedMatrixUint8) { compressed_matrix_type = "uint8"; } + else if (c.arg2 == kCompressedMatrixInt16) { compressed_matrix_type = "int16"; } + else { + KALDI_ASSERT(c.arg2 == kCompressedMatrixInt16); + compressed_matrix_type = "uint16"; + } + os << "CompressMatrix(" << submatrix_strings[c.arg1] + << range << ", " << compressed_matrix_type << ", " + << truncate << ")\n"; + break; + } + case kUncompressMatrix: + os << "UncompressMatrix(" << submatrix_strings[c.arg1] << ")\n"; + break; case kAcceptInput: os << submatrix_strings[c.arg1] << " = user input [for node: '" << nnet.GetNodeName(c.arg2) << "']\n"; diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 01c51e8e822..d077f9a69c6 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -237,6 +237,10 @@ struct ComputationRequest { compression type (it's converted from the enum CuCompressed MatrixType; 1=int8, 2=uint8, 3=int16, 4=uint16), and alpha determines the 'range' parameter (c.f. NewCuCompressedMatrix()). + arg3 will be converted to the 'truncate' argument to the + class CuCompressedMatrix; it should be false (0) if you know that + the input is limited to the allowed range, and true (1) if the + input may exceed that range (see docs for CuCompresedMatrix). - kUncompressMatrix: Uncompresses the matrix which is referred to by submatrix-index arg1 (it should previously have been compressed). 
- kAcceptInput: accepts a matrix of input from the user, which may be either diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index ad63043d851..f87b080fc43 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -391,8 +391,11 @@ void NnetComputer::ExecuteCommand() { int32 m = computation_.submatrices[c.arg1].matrix_index; KALDI_ASSERT(compressed_matrices_[m] == NULL && matrices_[m].NumRows() != 0); + BaseFloat range = c.alpha; + bool truncate = (c.arg3 != 0); compressed_matrices_[m] = NewCuCompressedMatrix( - static_cast(c.arg2), c.alpha); + static_cast(c.arg2), + range, truncate); compressed_matrices_[m]->CopyFromMat(matrices_[m]); matrices_[m].Resize(0, 0); } @@ -668,5 +671,14 @@ void NnetComputer::AcceptInputs(const Nnet &nnet, } } +NnetComputer::~NnetComputer() { + // Delete any pointers that are present in compressed_matrices_. Actually + // they should all already have been deallocated and set to NULL if the + // compuation was run to completion; we do this in case someone ran + // the forward propagation but not the backprop. + for (size_t i = 0; i < compressed_matrices_.size(); i++) + delete compressed_matrices_[i]; +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index 19af856bad8..9f1860c656d 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -125,6 +125,7 @@ class NnetComputer { CuMatrix *output); + ~NnetComputer(); private: void Init(); // called from constructors. @@ -239,7 +240,6 @@ class NnetComputer { // memos are not reusable. inline void *GetMemo(int32 memo_index); - private: NnetComputer &operator = (const NnetComputer &other); // Disallow. 
}; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 75521a43658..027781d64ad 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1042,9 +1042,12 @@ class MatrixExtender { // This function modifies the computation to fix certain problems // that might have been introduced by Extend()... allocation, deallocation, - // void FixComputation(); + // This function modifies the computation to fix the debug info; if needed, + // it's called from FixComputation(). + void FixDebugInfo(); + // don't extend a destination matrix if it wasn't already // at least 'min_proportion' (80%) big enough to store the source. BaseFloat min_proportion_; @@ -1204,14 +1207,64 @@ void MatrixExtender::FixComputation() { m = computation_->submatrices[s].matrix_index, new_s = whole_submatrices[m]; if (new_s != s) { - KALDI_ASSERT(orig_num_rows_[m] != computation_->matrices[m].num_rows); + KALDI_ASSERT( + computation_->submatrices[s] == computation_->submatrices[new_s] || + orig_num_rows_[m] != computation_->matrices[m].num_rows); + command.arg1 = new_s; + } + } + if (command.command_type == kSetConst && command.alpha == 0.0) { + int32 s = command.arg1, + m = computation_->submatrices[s].matrix_index, + new_s = whole_submatrices[m]; + if (new_s != s) { + { + const NnetComputation::SubMatrixInfo &info = computation_->submatrices[ + command.arg1]; + const NnetComputation::MatrixInfo &mat_info = computation_->matrices[ + info.matrix_index]; + // If this command wasn't zeroing the the entirety of a matrix, + // (before we extended the matrix), we don't need to extend it. + if (!(info.row_offset == 0 && info.col_offset == 0 && + info.num_cols == mat_info.num_cols && + info.num_rows == orig_num_rows_[info.matrix_index])) + continue; + // I know doing this via 'continue' is odd, but it's done this way to + // avoid invalid iterators still being in scope; I think some runtimes + // check for it. 
+ } command.arg1 = new_s; } } } + if (!computation_->matrix_debug_info.empty()) + FixDebugInfo(); RenumberComputation(computation_); } +void MatrixExtender::FixDebugInfo() { + int32 num_matrices = computation_->matrices.size(); + // matrix zero is not a 'real' matrix. + for (int32 m = 1; m < num_matrices; m++) { + NnetComputation::MatrixDebugInfo &debug_info = + computation_->matrix_debug_info[m]; + int32 new_num_rows = computation_->matrices[m].num_rows, + old_num_rows = debug_info.cindexes.size(); + if (new_num_rows != old_num_rows) { + debug_info.cindexes.resize(new_num_rows); + int32 num_extra_rows = new_num_rows - old_num_rows; + // the following should be true because min_proportion_ > 0.5. + KALDI_ASSERT(num_extra_rows <= old_num_rows); + for (int32 r = old_num_rows; r < new_num_rows; r++) { + Cindex cindex = debug_info.cindexes[r - num_extra_rows]; + // set the 't' value to kNoTime which indicates that it's not a 'real' + // time step, and may avoid errors in checking code. + cindex.second.t = kNoTime; + debug_info.cindexes[r] = cindex; + } + } + } +} void ExtendMatrices(NnetComputation *computation) { MatrixExtender ext(computation); @@ -3155,6 +3208,7 @@ void ComputationExpander::ComputeCommands() { case kAddRowRanges: ExpandRowRangesCommand(c, &c_out); break; + case kCompressMatrix: case kUncompressMatrix: case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: case kGotoLabel: @@ -4365,14 +4419,18 @@ class MemoryCompressionOptimizer { // sign (-1, 0 or 1) of the input, and decompresses it to -1, 0 or 1; this // is useful for ReLUs. BaseFloat range; - + // this is provided to the initializer of CuCompressedMatrix; it should + // be true if the values being compressed are potentially outside of + // the representable range. 
+ bool truncate; MatrixCompressInfo(int32 m, int32 forward_command_index, int32 backward_command_index, CuCompressedMatrixType compression_type, - BaseFloat range): + BaseFloat range, bool truncate): m(m), compression_command_index(forward_command_index), uncompression_command_index(backward_command_index), - compression_type(compression_type), range(range) { } + compression_type(compression_type), range(range), + truncate(truncate) { } }; std::vector compress_info_; @@ -4406,7 +4464,8 @@ void MemoryCompressionOptimizer::ModifyComputation() { std::pair p1( info.compression_command_index + 1, NnetComputation::Command(info.range, kCompressMatrix, - s, static_cast(info.compression_type))); + s, static_cast(info.compression_type), + info.truncate ? 1 : 0)); pairs_to_insert.push_back(p1); std::pair p2( info.uncompression_command_index, @@ -4443,6 +4502,11 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { std::vector::const_iterator iter = std::lower_bound(accesses.begin(), accesses.end(), middle_access); + + if (m == 84) { + KALDI_LOG << "m == 84"; //TEMP + } + // At this point, 'iter' points to the first access in 'accesses' // whose command index is >= 'middle_command_' (which separates the forward // and backward passes), or accesses.end() if this matrix was not @@ -4466,18 +4530,10 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { // 'backward_access_is_last_access' is going to be set to true if // 'backward_access' is the last command to access the matrix (apart from - // deallocation commands). - bool backward_access_is_last_access = false; - if (accesses.end() - iter <= 2) { - // if there is at most 1 command after 'backward_access' that accesses this - // matrix... 
- const Access &next_access = iter[1]; - NnetComputation::Command &next_command = - computation_->commands[next_access.command_index]; - if (next_command.command_type == kDeallocMatrix || - next_command.command_type == kSwapMatrix) - backward_access_is_last_access = true; - } + // deallocation or matrix-swap commands, which don't show up in the list of + // accesses). + bool backward_access_is_last_access = (accesses.end() == iter + 1); + int32 backward_command_index = backward_access.command_index, forward_command_index = forward_access.command_index; NnetComputation::Command @@ -4495,7 +4551,8 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { compress_info_.push_back( MatrixCompressInfo(m, forward_command_index, backward_command_index, - kCompressedMatrixUint8, 0.0)); + kCompressedMatrixUint8, 0.0, + true)); return; } } @@ -4510,7 +4567,8 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { compress_info_.push_back( MatrixCompressInfo(m, forward_command_index, backward_command_index, - kCompressedMatrixInt16, 10.0)); + kCompressedMatrixInt16, 10.0, + true)); return; } diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index 872624eaa7e..a67f0090ef7 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -478,7 +478,7 @@ void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, int32 max_output_time_in_request, NnetComputation *computation) { - if (GetVerboseLevel() >= 3) { + if (GetVerboseLevel() >= 1) { // TEMP, should be 3 CheckComputation(nnet, *computation, true); KALDI_LOG << "Before optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); @@ -515,6 +515,20 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, true); } + + if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { + bool must_renumber = false; + if (config.snip_row_ops && SnipRowOps(computation)) + must_renumber = true; + if (config.optimize_row_ops && 
ReplaceRowWithMatrixOps(computation)) + must_renumber = true; + if (must_renumber) { + RenumberComputation(computation); + if (GetVerboseLevel() >= 3) + CheckComputation(nnet, *computation, false); + } + } + if (config.optimize && config.extend_matrices && !config.optimize_looped_computation) { ExtendMatrices(computation); @@ -531,20 +545,6 @@ void Optimize(const NnetOptimizeOptions &config, CheckComputation(nnet, *computation, false); } - if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { - bool must_renumber = false; - if (config.snip_row_ops && SnipRowOps(computation)) - must_renumber = true; - if (config.optimize_row_ops && ReplaceRowWithMatrixOps(computation)) - must_renumber = true; - if (must_renumber) { - RenumberComputation(computation); - if (GetVerboseLevel() >= 3) - CheckComputation(nnet, *computation, false); - } - } - - if (config.optimize && config.initialize_undefined) { RemoveUnnecessaryZeroing(nnet, computation); if (GetVerboseLevel() >= 3) @@ -594,13 +594,20 @@ void Optimize(const NnetOptimizeOptions &config, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); + + { // TEMP + std::ostringstream os; + computation->Print(os, nnet); + KALDI_LOG << "Compuation after adding memory compression is: " << os.str(); + } } - if (GetVerboseLevel() >= 3) { + if (GetVerboseLevel() >= 1) { // TEMP, should be 3 CheckComputation(nnet, *computation, false); KALDI_LOG << "After optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); } + } // ComputationRequests are distinguished by the names and indexes diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index ba8efce0fe3..cf308dd3b00 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -131,9 +131,6 @@ struct NnetOptimizeOptions { opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " "disable an optimization that reduces the size of certain " "per-row operations"); - 
opts->Register("snip-row-ops", &snip_row_ops, "Set this to false to " - "disable an optimization that reduces the size of certain " - "per-row operations"); opts->Register("memory-compression-level", &memory_compression_level, "This is only relevant to training, not decoding. Set this " "to 0,1,2,3; higher levels are more aggressive at reducing " From 0f14373f479099f894062ba714bf607479170145 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 14:15:22 -0500 Subject: [PATCH 094/184] [src] Work around problem related to ungetc failures on ifstream (#2194) --- src/base/io-funcs.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/base/io-funcs.cc b/src/base/io-funcs.cc index 8b8662b6401..90988faf3ea 100644 --- a/src/base/io-funcs.cc +++ b/src/base/io-funcs.cc @@ -178,8 +178,14 @@ int PeekToken(std::istream &is, bool binary) { } int ans = is.peek(); if (read_bracket) { - if (!is.unget()) + if (!is.unget()) { KALDI_WARN << "Error ungetting '<' in PeekToken"; + // Clear the bad bit. It seems to be possible for this code to be + // reached, and the C++ standard is very vague on whether even a single + // call to unget() should succeed; see + // http://www.cplusplus.com/reference/istream/istream/unget/ + is.clear(); + } } return ans; } @@ -197,7 +203,12 @@ void ExpectToken(std::istream &is, bool binary, const char *token) { KALDI_ERR << "Failed to read token [started at file position " << pos_at_start << "], expected " << token; } - if (strcmp(str.c_str(), token) != 0) { + // The second half of the '&&' expression below is so that if we're expecting + // "", we will accept "Foo>" instead. This is so that the model-reading + // code will tolerate errors in PeekToken where is.unget() failed; search for + // is.clear() in PeekToken() for an explanation. 
+ if (strcmp(str.c_str(), token) != 0 && + !(token[0] == '<' && strcmp(str.c_str(), token + 1) == 0)) { KALDI_ERR << "Expected token \"" << token << "\", got instead \"" << str <<"\"."; } From 6b62e0a0459fd6c1708dab4f193b9f6fa404bbc9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 16:43:52 -0500 Subject: [PATCH 095/184] [src] Small fix to component reading, workaround for ungetc() issue. --- src/nnet3/nnet-simple-component.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 01adb222372..b3cf89ae6b4 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2725,8 +2725,8 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { } std::string token; ReadToken(is, binary, &token); - if (token != "" && - token != "") + // the following has to handle a couple variants of + if (token.find("NaturalGradientAffineComponent>") == std::string::npos) KALDI_ERR << "Expected or " << ", got " << token; } From 30e9a90d30d2007b30698a6351c9a36df1acf2ad Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 16:43:52 -0500 Subject: [PATCH 096/184] [src] Small fix to component reading, workaround for ungetc() issue. 
--- src/nnet3/nnet-simple-component.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 91906ac1ddf..c6d2c1f7952 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -2854,8 +2854,8 @@ void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { } std::string token; ReadToken(is, binary, &token); - if (token != "" && - token != "") + // the following has to handle a couple variants of + if (token.find("NaturalGradientAffineComponent>") == std::string::npos) KALDI_ERR << "Expected or " << ", got " << token; SetNaturalGradientConfigs(); From b9fc15171319b5c0f0d0cfb16b4201524523adc9 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 19:29:52 -0500 Subject: [PATCH 097/184] [src] Some small cosmetic changes --- src/chain/chain-denominator.cc | 2 +- src/chain/chain-denominator.h | 3 +-- src/nnet3/nnet-utils.cc | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 2b27d4b9176..620ea873eb7 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -260,7 +260,7 @@ bool DenominatorComputation::Backward( BetaGeneralFrameDebug(t); Beta(t); if (t % kMaxDerivTimeSteps == 0) { - // commit the derivative stored in exp_nnet_output_transposed_ by adding + // commit the derivative stored in nnet_output_deriv_transposed_ by adding // its transpose to the appropriate sub-matrix of 'nnet_output_deriv'. 
int32 chunk_frames = std::min(static_cast(kMaxDerivTimeSteps), frames_per_sequence_ - t), diff --git a/src/chain/chain-denominator.h b/src/chain/chain-denominator.h index a4a417c8a5d..f44588e434f 100644 --- a/src/chain/chain-denominator.h +++ b/src/chain/chain-denominator.h @@ -51,7 +51,7 @@ namespace chain { All this is done in parallel over multiple sequences, but the computations are independent over the separate sequences, so we won't introduce any notation - or index for the sequence; we'll just explain it for one sequences. + or index for the sequence; we'll just explain it for one sequence. Suppose we have I hmm-states, numbered i = 0 ... I-1 (we'll use i and j for hmm-state indexes). Let foll(i) give a list of arcs leaving state i, and @@ -313,4 +313,3 @@ class DenominatorComputation { } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_DENOMINATOR_H_ - diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index cc5762474d6..59885cf70b2 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -888,7 +888,7 @@ void ConstrainOrthonormalInternal(BaseFloat scale, CuMatrixBase *M) { if (GetVerboseLevel() >= 1) { BaseFloat error = P.FrobeniusNorm(); - KALDI_VLOG(1) << "Error in orthogonality is " << error; + KALDI_VLOG(2) << "Error in orthogonality is " << error; } // At this point, the matrix P contains what, in the math, would be Q = From 46cdd54f4b9fb6124acefb92258f9d3ce57d82de Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 19:32:57 -0500 Subject: [PATCH 098/184] [src] Various fixes w.r.t memory compression; cosmetic fixes too. 
--- src/nnet3/nnet-analyze.cc | 28 +++++++++++++++------------- src/nnet3/nnet-analyze.h | 2 +- src/nnet3/nnet-computation.cc | 26 +++++++++++++++++++------- src/nnet3/nnet-computation.h | 4 ++-- src/nnet3/nnet-compute.cc | 2 +- src/nnet3/nnet-compute.h | 2 +- src/nnet3/nnet-optimize-utils.cc | 9 ++------- src/nnet3/nnet-optimize-utils.h | 2 +- src/nnet3/nnet-optimize.cc | 10 ++-------- 9 files changed, 44 insertions(+), 41 deletions(-) diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index cd49e22b451..cf48d3d86c6 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -388,7 +388,7 @@ void ComputeCommandAttributes( vars.RecordAccessForSubmatrix(c.arg1, kReadWriteAccess, &attr); break; } - case kUncompressMatrix: { + case kDecompressMatrix: { vars.RecordAccessForSubmatrix(c.arg1, kWriteAccess, &attr); break; } @@ -656,7 +656,11 @@ void ComputationChecker::CheckComputationUndefined() const { << a_.variables.DescribeVariable(v) << " is never used."; } } else { - if (accesses[0].access_type != kWriteAccess) + // It's OK if part of a matrix is compressed, that is undefined; + // likely that part won't be referred to when we uncompress. + if (accesses[0].access_type != kWriteAccess && + !(computation_.commands[accesses[0].command_index].command_type == + kCompressMatrix)) KALDI_ERR << "Variable " << v << " == " << a_.variables.DescribeVariable(v) << " is read before it is written to"; @@ -738,7 +742,7 @@ void ComputationChecker::CheckComputationCompression() const { int32 command_index = access.command_index; const NnetComputation::Command &command = computation_.commands[command_index]; - if (command.command_type == kUncompressMatrix) { + if (command.command_type == kDecompressMatrix) { // check that the previous access to this matrix was a compression // command. KALDI_ASSERT( @@ -751,7 +755,7 @@ void ComputationChecker::CheckComputationCompression() const { // command. 
int32 next_command_index = accesses.accesses[a+1].command_index; KALDI_ASSERT(computation_.commands[next_command_index].command_type == - kUncompressMatrix && + kDecompressMatrix && command_index < middle_command && next_command_index > middle_command); if (command.alpha == 0.0) { @@ -1042,7 +1046,7 @@ void ComputationChecker::CheckComputationIndexes() const { KALDI_ERR << "Invalid alpha in kCompressMatrix command."; break; } - case kUncompressMatrix: { + case kDecompressMatrix: { if (c.arg1 < 1 || c.arg1 >= num_submatrices || !computation_.IsWholeMatrix(c.arg1)) KALDI_ERR << "submatrix index out of range or invalid"; @@ -1445,13 +1449,11 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { this_num_bytes = static_cast(sizeof(BaseFloat)) * submat_info.num_rows * submat_info.num_cols; - if (c.arg2 >= static_cast(kCompressedMatrixInt8) && - c.arg2 <= static_cast(kCompressedMatrixUint16)) { - this_compressed_num_bytes = - ((c.arg2 == static_cast(kCompressedMatrixInt8) || - c.arg2 == static_cast(kCompressedMatrixUint8)) ? - 1 : 2) * submat_info.num_rows * submat_info.num_cols; - } + this_compressed_num_bytes = + ((c.arg2 == static_cast(kCompressedMatrixInt8) || + c.arg2 == static_cast(kCompressedMatrixUint8)) ? + 1 : 2) * static_cast(submat_info.num_rows) * + submat_info.num_cols; } switch (c.command_type) { case kAllocMatrix: @@ -1464,7 +1466,7 @@ int64 GetMaxMemoryUse(const NnetComputation &computation) { case kCompressMatrix: cur_memory_use += this_compressed_num_bytes - this_num_bytes; break; - case kUncompressMatrix: + case kDecompressMatrix: cur_memory_use += this_num_bytes - this_compressed_num_bytes; break; default: diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 2e1a9a33c0b..77466039756 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -423,7 +423,7 @@ class ComputationChecker { void CheckComputationRewrite() const; // Check matrix accesses make sense. 
void CheckComputationMatrixAccesses() const; - // Some checks related to the kCompressMatrix and kUncompressMatrix commands. + // Some checks related to the kCompressMatrix and kDecompressMatrix commands. void CheckComputationCompression() const; // Check debug_info has the correct size, if used. void CheckComputationDebugInfo() const; diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index 40f9f2146a0..a9a21bb3f24 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -284,8 +284,8 @@ void NnetComputation::Command::Read(std::istream &is, bool binary) { command_type = kAddRowRanges; } else if (command_type_str == "kCompressMatrix") { command_type = kCompressMatrix; - } else if (command_type_str == "kUncompressMatrix") { - command_type = kUncompressMatrix; + } else if (command_type_str == "kDecompressMatrix") { + command_type = kDecompressMatrix; } else if (command_type_str == "kAcceptInput") { command_type = kAcceptInput; } else if (command_type_str == "kProvideOutput") { @@ -382,8 +382,8 @@ void NnetComputation::Command::Write(std::ostream &os, bool binary) const { case kCompressMatrix: os << "kCompressMatrix\n"; break; - case kUncompressMatrix: - os << "kUncompressMatrix\n"; + case kDecompressMatrix: + os << "kDecompressMatrix\n"; break; case kAcceptInput: os << "kAcceptInput\n"; @@ -510,13 +510,17 @@ static void GetIndexesMultiStrings( // writes to "os" the statement for this command. -static void PrintCommand(std::ostream &os, +static void PrintCommand(std::ostream &os_out, const Nnet &nnet, const NnetComputation &computation, int32 command_index, const std::vector &submatrix_strings, const std::vector &indexes_strings, const std::vector &indexes_multi_strings) { + // If the string is longer than 'max_string_length' characters, it will + // be summarized with '...' in the middle. 
+ size_t max_string_length = 200; + std::ostringstream os; KALDI_ASSERT(command_index < computation.commands.size()); os << "c" << command_index << ": "; const NnetComputation::Command &c = computation.commands[command_index]; @@ -637,8 +641,8 @@ static void PrintCommand(std::ostream &os, << truncate << ")\n"; break; } - case kUncompressMatrix: - os << "UncompressMatrix(" << submatrix_strings[c.arg1] << ")\n"; + case kDecompressMatrix: + os << "DecompressMatrix(" << submatrix_strings[c.arg1] << ")\n"; break; case kAcceptInput: os << submatrix_strings[c.arg1] << " = user input [for node: '" @@ -666,6 +670,14 @@ static void PrintCommand(std::ostream &os, default: KALDI_ERR << "Un-handled command type."; } + std::string str = os.str(); + if (str.size() <= max_string_length) { + os_out << str; + } else { + size_t len = str.size(); + os_out << str.substr(0, max_string_length / 2) << " ... " + << str.substr(len - max_string_length / 2); + } } diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index d077f9a69c6..0c6c690684a 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -241,7 +241,7 @@ struct ComputationRequest { class CuCompressedMatrix; it should be false (0) if you know that the input is limited to the allowed range, and true (1) if the input may exceed that range (see docs for CuCompresedMatrix). - - kUncompressMatrix: Uncompresses the matrix which is referred to + - kDecompressMatrix: Decompresses the matrix which is referred to by submatrix-index arg1 (it should previously have been compressed). - kAcceptInput: accepts a matrix of input from the user, which may be either features, or derivatives w.r.t. the output. 
arg1 is the submatrix index of @@ -274,7 +274,7 @@ enum CommandType { kPropagate, kBackprop, kBackpropNoModelUpdate, kMatrixCopy, kMatrixAdd, kCopyRows, kAddRows, kCopyRowsMulti, kCopyToRowsMulti, kAddRowsMulti, kAddToRowsMulti, - kAddRowRanges, kCompressMatrix, kUncompressMatrix, + kAddRowRanges, kCompressMatrix, kDecompressMatrix, kAcceptInput, kProvideOutput, kNoOperation, kNoOperationPermanent, kNoOperationMarker, kNoOperationLabel, kGotoLabel }; diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index f87b080fc43..835b7fcfd88 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -401,7 +401,7 @@ void NnetComputer::ExecuteCommand() { } #endif } - case kUncompressMatrix: { + case kDecompressMatrix: { #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int32 m = computation_.submatrices[c.arg1].matrix_index; diff --git a/src/nnet3/nnet-compute.h b/src/nnet3/nnet-compute.h index 9f1860c656d..333ed3168b9 100644 --- a/src/nnet3/nnet-compute.h +++ b/src/nnet3/nnet-compute.h @@ -164,7 +164,7 @@ class NnetComputer { // NULL). std::vector memos_; - // This is only used when commands kCompressMatrix and kUncompressMatrix are + // This is only used when commands kCompressMatrix and kDecompressMatrix are // invoked. It will be (the first time we compress a matrix) resized to be // the same size as 'matrices_' (i.e., indexed by matrix index). 
When we // compress a matrix m we set compressed_matrices_[m] to a non-NULL value and diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 027781d64ad..19ca31cf955 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -3208,7 +3208,7 @@ void ComputationExpander::ComputeCommands() { case kAddRowRanges: ExpandRowRangesCommand(c, &c_out); break; - case kCompressMatrix: case kUncompressMatrix: + case kCompressMatrix: case kDecompressMatrix: case kAcceptInput: case kProvideOutput: case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: case kGotoLabel: @@ -4469,7 +4469,7 @@ void MemoryCompressionOptimizer::ModifyComputation() { pairs_to_insert.push_back(p1); std::pair p2( info.uncompression_command_index, - NnetComputation::Command(1.0, kUncompressMatrix, s)); + NnetComputation::Command(1.0, kDecompressMatrix, s)); pairs_to_insert.push_back(p2); } InsertCommands(&pairs_to_insert, @@ -4502,11 +4502,6 @@ void MemoryCompressionOptimizer::ProcessMatrix(int32 m) { std::vector::const_iterator iter = std::lower_bound(accesses.begin(), accesses.end(), middle_access); - - if (m == 84) { - KALDI_LOG << "m == 84"; //TEMP - } - // At this point, 'iter' points to the first access in 'accesses' // whose command index is >= 'middle_command_' (which separates the forward // and backward passes), or accesses.end() if this matrix was not diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 93f3cdb128f..703f43af095 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -552,7 +552,7 @@ void InsertCommands( NnetComputation *computation); /// Performs optimization to reduce memory usage where possible, -/// making use of the kCompressMatrix and kUncompressMatrix commands. +/// making use of the kCompressMatrix and kDecompressMatrix commands. 
/// Should only be done after most other optimizations, because some /// optimizations (such as variable-merging) would not work correctly /// after doing this optimization. This does nothing for looped diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index a67f0090ef7..d614afce7d0 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -478,7 +478,7 @@ void Optimize(const NnetOptimizeOptions &config, const Nnet &nnet, int32 max_output_time_in_request, NnetComputation *computation) { - if (GetVerboseLevel() >= 1) { // TEMP, should be 3 + if (GetVerboseLevel() >= 3) { CheckComputation(nnet, *computation, true); KALDI_LOG << "Before optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); @@ -594,15 +594,9 @@ void Optimize(const NnetOptimizeOptions &config, computation); if (GetVerboseLevel() >= 3) CheckComputation(nnet, *computation, false); - - { // TEMP - std::ostringstream os; - computation->Print(os, nnet); - KALDI_LOG << "Compuation after adding memory compression is: " << os.str(); - } } - if (GetVerboseLevel() >= 1) { // TEMP, should be 3 + if (GetVerboseLevel() >= 3) { CheckComputation(nnet, *computation, false); KALDI_LOG << "After optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); From 8638ec9d7b2bf503a2fbfde4eb505354bd5a52b8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 20:30:36 -0500 Subject: [PATCH 099/184] [src] Change for memory efficiency to chain training; small cosmetic fixes. 
--- src/chain/chain-supervision-test.cc | 2 +- src/chain/chain-training.cc | 54 ++++++++++++++++------------- src/chain/chain-training.h | 18 +++++----- src/nnet3/nnet-chain-training.cc | 3 -- src/nnet3/nnet-compile-looped.cc | 1 - 5 files changed, 41 insertions(+), 37 deletions(-) diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index 7bf3c17854a..d14c80cd84f 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -607,8 +607,8 @@ void TestRanges() { int main() { using namespace kaldi; SetVerboseLevel(1); - int32 loop = 0; #if HAVE_CUDA == 1 + int32 loop = 0; for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); if (loop == 0) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 53de69a0e07..677e8f8d3dc 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -33,38 +33,44 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv) { - BaseFloat num_logprob_weighted; - if (nnet_output_deriv) + CuMatrix *xent_output_deriv) { + BaseFloat num_logprob_weighted, den_logprob_weighted; + bool ok = true; + if (nnet_output_deriv != NULL) nnet_output_deriv->SetZero(); + + { // Doing the denominator first helps to reduce the maximum + // memory use, as we can set 'xent_deriv' to nonempty after + // we've freed the memory in this object. 
+ DenominatorComputation denominator(opts, den_graph, + supervision.num_sequences, + nnet_output); + + den_logprob_weighted = supervision.weight * denominator.Forward(); + if (nnet_output_deriv) + ok = denominator.Backward(-supervision.weight, + nnet_output_deriv); + } + + if (xent_output_deriv != NULL) + xent_output_deriv->Resize(nnet_output.NumRows(), nnet_output.NumCols()); + + { NumeratorComputation numerator(supervision, nnet_output); // note: supervision.weight is included as a factor in the derivative from - // the numerator object, and the logprob too. + // the numerator object, as well as the returned logprob. num_logprob_weighted = numerator.Forward(); - if (nnet_output_deriv) { - numerator.Backward(nnet_output_deriv); - if (xent_output_deriv) - xent_output_deriv->CopyFromMat(*nnet_output_deriv); - } else if (xent_output_deriv) { - // this branch will be taken if xent_output_deriv but not - // nnet_output_deriv is set- which could happen if you want to compute the - // cross-entropy objective but not the derivatives. - xent_output_deriv->SetZero(); + + if (xent_output_deriv) { numerator.Backward(xent_output_deriv); + nnet_output_deriv->AddMat(1.0, *xent_output_deriv); + } else if (nnet_output_deriv) { + numerator.Backward(nnet_output_deriv); } } - DenominatorComputation denominator(opts, den_graph, - supervision.num_sequences, - nnet_output); - - BaseFloat den_logprob = denominator.Forward(); - bool ok = true; - if (nnet_output_deriv) - ok = denominator.Backward(-supervision.weight, - nnet_output_deriv); - *objf = num_logprob_weighted - supervision.weight * den_logprob; + *objf = num_logprob_weighted - den_logprob_weighted; *weight = supervision.weight * supervision.num_sequences * supervision.frames_per_sequence; if (!((*objf) - (*objf) == 0) || !ok) { @@ -86,7 +92,7 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, // for different frames of the sequences. 
As expected, they are // smaller towards the edges of the sequences (due to the penalization // of 'incorrect' pdf-ids. - if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL) { + if (GetVerboseLevel() >= 1 && nnet_output_deriv != NULL && RandInt(0, 10) == 0) { int32 tot_frames = nnet_output_deriv->NumRows(), frames_per_sequence = supervision.frames_per_sequence, num_sequences = supervision.num_sequences; diff --git a/src/chain/chain-training.h b/src/chain/chain-training.h index e6143d10846..d6535902625 100644 --- a/src/chain/chain-training.h +++ b/src/chain/chain-training.h @@ -63,7 +63,7 @@ struct ChainTrainingOptions { ChainTrainingOptions(): l2_regularize(0.0), leaky_hmm_coefficient(1.0e-05), xent_regularize(0.0) { } - + void Register(OptionsItf *opts) { opts->Register("l2-regularize", &l2_regularize, "l2 regularization " "constant for 'chain' training, applied to the output " @@ -107,10 +107,13 @@ struct ChainTrainingOptions { You don't have to zero this before passing to this function, we zero it internally. @param [out] xent_output_deriv If non-NULL, then the numerator part of the derivative - (which equals a posterior from the numerator forward-backward, - scaled by the supervision weight) is written to here. This will - be used in the cross-entropy regularization code. This value - is also used in computing the cross-entropy objective value. + (which equals a posterior from the numerator + forward-backward, scaled by the supervision weight) + is written to here (this function will set it to the + correct size first; doing it this way reduces the + peak memory use). xent_output_deriv will be used in + the cross-entropy regularization code; it is also + used in computing the cross-entropy objective value. 
*/ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, const DenominatorGraph &den_graph, @@ -120,12 +123,11 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, BaseFloat *l2_term, BaseFloat *weight, CuMatrixBase *nnet_output_deriv, - CuMatrixBase *xent_output_deriv = NULL); - + CuMatrix *xent_output_deriv = NULL); + } // namespace chain } // namespace kaldi #endif // KALDI_CHAIN_CHAIN_TRAINING_H_ - diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index f2eaff7e429..16d955fb2f7 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -220,9 +220,6 @@ void NnetChainTrainer::ProcessOutputs(bool is_backstitch_step2, bool use_xent = (opts_.chain_config.xent_regularize != 0.0); std::string xent_name = sup.name + "-xent"; // typically "output-xent". CuMatrix xent_deriv; - if (use_xent) - xent_deriv.Resize(nnet_output.NumRows(), nnet_output.NumCols(), - kUndefined); BaseFloat tot_objf, tot_l2_term, tot_weight; diff --git a/src/nnet3/nnet-compile-looped.cc b/src/nnet3/nnet-compile-looped.cc index fa8a2322e5a..1a5ceabab0e 100644 --- a/src/nnet3/nnet-compile-looped.cc +++ b/src/nnet3/nnet-compile-looped.cc @@ -357,7 +357,6 @@ void CreateLoopedComputationRequestSimple(const Nnet &nnet, ComputationRequest *request1, ComputationRequest *request2, ComputationRequest *request3) { - bool has_ivector = (nnet.InputDim("ivector") > 0); int32 left_context, right_context; ComputeSimpleNnetContext(nnet, &left_context, &right_context); From 32b9f7b86b548587daccdabb2f158c37b46c65bd Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 22:16:26 -0500 Subject: [PATCH 100/184] [src] Fix to nnet-compute RE compression code. 
--- src/nnet3/nnet-compute.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/nnet3/nnet-compute.cc b/src/nnet3/nnet-compute.cc index 835b7fcfd88..19eecdda72b 100644 --- a/src/nnet3/nnet-compute.cc +++ b/src/nnet3/nnet-compute.cc @@ -382,7 +382,7 @@ void NnetComputer::ExecuteCommand() { } break; } - case kCompressMatrix: { + case kCompressMatrix: // This does nothing if CUDA is not in use. #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { @@ -399,9 +399,9 @@ void NnetComputer::ExecuteCommand() { compressed_matrices_[m]->CopyFromMat(matrices_[m]); matrices_[m].Resize(0, 0); } + break; #endif - } - case kDecompressMatrix: { + case kDecompressMatrix: #if HAVE_CUDA == 1 if (CuDevice::Instantiate().Enabled()) { int32 m = computation_.submatrices[c.arg1].matrix_index; @@ -417,7 +417,7 @@ void NnetComputer::ExecuteCommand() { compressed_matrices_[m] = NULL; } #endif - } + break; case kNoOperation: case kNoOperationPermanent: case kNoOperationMarker: case kNoOperationLabel: break; From c89812a7f3078654b4374269e7e21b042651d1bc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 28 Jan 2018 23:59:58 -0500 Subject: [PATCH 101/184] [scripts] Add catch-all option 'trainer.add-option' --- .../libs/nnet3/train/chain_objf/acoustic_model.py | 9 +++++---- egs/wsj/s5/steps/libs/nnet3/train/common.py | 5 +++++ .../steps/libs/nnet3/train/frame_level_objf/common.py | 11 ++++++----- egs/wsj/s5/steps/nnet3/chain/train.py | 1 + egs/wsj/s5/steps/nnet3/train_dnn.py | 7 ++++--- egs/wsj/s5/steps/nnet3/train_raw_dnn.py | 1 + egs/wsj/s5/steps/nnet3/train_raw_rnn.py | 1 + egs/wsj/s5/steps/nnet3/train_rnn.py | 1 + 8 files changed, 24 insertions(+), 12 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index c63901367d6..3df2720b2c0 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ 
b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -128,7 +128,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize, xent_regularize, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, run_opts, + frame_subsampling_factor, run_opts, train_opts, backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from train_one_iteration(), this method trains new models @@ -184,7 +184,7 @@ def train_new_models(dir, iter, srand, num_jobs, --max-param-change={max_param_change} \ --backstitch-training-scale={backstitch_training_scale} \ --backstitch-training-interval={backstitch_training_interval} \ - --l2-regularize-factor={l2_regularize_factor} \ + --l2-regularize-factor={l2_regularize_factor} {train_opts} \ --srand={srand} \ "{raw_model}" {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs \ @@ -201,6 +201,7 @@ def train_new_models(dir, iter, srand, num_jobs, deriv_time_opts=" ".join(deriv_time_opts), app_deriv_wts=apply_deriv_weights, fr_shft=frame_shift, l2=l2_regularize, + train_opts=train_opts, xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, cache_io_opts=cache_io_opts, parallel_train_opts=run_opts.parallel_train_opts, @@ -233,7 +234,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, leaky_hmm_coefficient, momentum, max_param_change, shuffle_buffer_size, frame_subsampling_factor, - run_opts, dropout_edit_string="", + run_opts, dropout_edit_string="", train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective @@ -306,7 +307,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, shuffle_buffer_size=shuffle_buffer_size, num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, frame_subsampling_factor=frame_subsampling_factor, - run_opts=run_opts, + run_opts=run_opts, train_opts=train_opts, # linearly increase 
backstitch_training_scale during the # first few iterations (hard-coded as 15) backstitch_training_scale=(backstitch_training_scale * diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6c194a2c0a1..6f3e8877ae8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -903,6 +903,11 @@ def __init__(self, lstm*=0,0.2,0'. More general should precede less general patterns, as they are applied sequentially.""") + self.parser.add_argument("--trainer.add-option", type=str, + dest='train_opts', action='append', + help="""You can use this to add arbitrary options that + will be passed through to the core training code (nnet3-train + or nnet3-chain-train)""") self.parser.add_argument("--trainer.optimization.backstitch-training-scale", type=float, dest='backstitch_training_scale', default=0.0, help="""scale of parameters changes diff --git a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py index 72b776351f6..9dd12e63f52 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/frame_level_objf/common.py @@ -33,7 +33,7 @@ def train_new_models(dir, iter, srand, num_jobs, image_augmentation_opts, run_opts, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - use_multitask_egs=False, + use_multitask_egs=False, train_opts="", backstitch_training_scale=0.0, backstitch_training_interval=1): """ Called from train_one_iteration(), this model does one iteration of training with 'num_jobs' jobs, and writes files like @@ -144,7 +144,7 @@ def train_new_models(dir, iter, srand, num_jobs, --backstitch-training-scale={backstitch_training_scale} \ --l2-regularize-factor={l2_regularize_factor} \ --backstitch-training-interval={backstitch_training_interval} \ - --srand={srand} \ + --srand={srand} {train_opts} \ {deriv_time_opts} 
"{raw_model}" "{egs_rspecifier}" \ {dir}/{next_iter}.{job}.raw""".format( command=run_opts.command, @@ -159,6 +159,7 @@ def train_new_models(dir, iter, srand, num_jobs, l2_regularize_factor=1.0/num_jobs, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval, + train_opts=train_opts, deriv_time_opts=" ".join(deriv_time_opts), raw_model=raw_model_string, egs_rspecifier=egs_rspecifier), @@ -177,9 +178,8 @@ def train_one_iteration(dir, iter, srand, egs_dir, run_opts, image_augmentation_opts=None, frames_per_eg=-1, min_deriv_time=None, max_deriv_time_relative=None, - shrinkage_value=1.0, dropout_edit_string="", - get_raw_nnet_from_am=True, - use_multitask_egs=False, + shrinkage_value=1.0, dropout_edit_string="", train_opts="", + get_raw_nnet_from_am=True, use_multitask_egs=False, backstitch_training_scale=0.0, backstitch_training_interval=1, compute_per_dim_accuracy=False): """ Called from steps/nnet3/train_*.py scripts for one iteration of neural @@ -279,6 +279,7 @@ def train_one_iteration(dir, iter, srand, egs_dir, max_deriv_time_relative=max_deriv_time_relative, image_augmentation_opts=image_augmentation_opts, use_multitask_egs=use_multitask_egs, + train_opts=train_opts, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 011b6894938..9c90c3d6930 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -503,6 +503,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, num_chunk_per_minibatch_str=args.num_chunk_per_minibatch, apply_deriv_weights=args.apply_deriv_weights, diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 
073ad3e7d7a..abd803c0e14 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -328,6 +328,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, momentum=args.momentum, @@ -365,16 +366,16 @@ def train(args, run_opts): egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, max_objective_evaluations=args.max_objective_evaluations) - + if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") - + # If args.do_final_combination is true, we will use the combined model. # Otherwise, we will use the last_numbered model. real_iter = 'combined' if args.do_final_combination else num_iters avg_post_vec_file = train_lib.common.compute_average_posterior( - dir=args.dir, iter=real_iter, + dir=args.dir, iter=real_iter, egs_dir=egs_dir, num_archives=num_archives, prior_subset_size=args.prior_subset_size, run_opts=run_opts) diff --git a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py index 2d092ceebc7..d5b37871d70 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_dnn.py @@ -356,6 +356,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), minibatch_size_str=args.minibatch_size, frames_per_eg=args.frames_per_eg, momentum=args.momentum, diff --git a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py index b51632e7d2c..686b76aa7db 100755 --- a/egs/wsj/s5/steps/nnet3/train_raw_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_raw_rnn.py @@ -432,6 +432,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' 
'.join(args.train_opts), shrinkage_value=shrinkage_value, minibatch_size_str=args.num_chunk_per_minibatch, min_deriv_time=min_deriv_time, diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 005e751cae0..2f49c6efff3 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -410,6 +410,7 @@ def train(args, run_opts): args.dropout_schedule, float(num_archives_processed) / num_archives_to_process, iter), + train_opts=' '.join(args.train_opts), shrinkage_value=shrinkage_value, minibatch_size_str=args.num_chunk_per_minibatch, min_deriv_time=min_deriv_time, From dbcabb8bcee6ef353141fde9c71cf45eda0b94ce Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 29 Jan 2018 00:22:59 -0500 Subject: [PATCH 102/184] [scripts] Small fix regarding --trainer.add-option option --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 6f3e8877ae8..443834fc161 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -904,7 +904,7 @@ def __init__(self, less general patterns, as they are applied sequentially.""") self.parser.add_argument("--trainer.add-option", type=str, - dest='train_opts', action='append', + dest='train_opts', action='append', default=[], help="""You can use this to add arbitrary options that will be passed through to the core training code (nnet3-train or nnet3-chain-train)""") From cf745735ec88da8fd9af74103685d6560dba9d4a Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 29 Jan 2018 20:39:58 -0500 Subject: [PATCH 103/184] [src] Various bug-fixes relating to recent nnet3/chain changes. 
--- src/chain/chain-training.cc | 3 ++- src/nnet3/nnet-analyze.cc | 17 ++++++++++------- src/nnet3/nnet-optimize-utils.cc | 18 ++++++++++++------ src/nnet3/nnet-optimize.h | 7 ++++--- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/chain/chain-training.cc b/src/chain/chain-training.cc index 677e8f8d3dc..bf61bed67f0 100644 --- a/src/chain/chain-training.cc +++ b/src/chain/chain-training.cc @@ -64,7 +64,8 @@ void ComputeChainObjfAndDeriv(const ChainTrainingOptions &opts, if (xent_output_deriv) { numerator.Backward(xent_output_deriv); - nnet_output_deriv->AddMat(1.0, *xent_output_deriv); + if (nnet_output_deriv) + nnet_output_deriv->AddMat(1.0, *xent_output_deriv); } else if (nnet_output_deriv) { numerator.Backward(nnet_output_deriv); } diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index cf48d3d86c6..ec1d3fa0f2e 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -634,21 +634,24 @@ void ComputationChecker::CheckComputationRewrite() const { Checks for the situation where a variable is read before being written. */ void ComputationChecker::CheckComputationUndefined() const { + // the variable 'min_proportion' needs to be <= the min_proportion_ value in + // class MatrixExtender, otherwise this code could spuriously reject a + // computation. + BaseFloat min_proportion = 0.8; + int32 num_variables = a_.variable_accesses.size(); for (int32 v = 0; v < num_variables; v++) { const std::vector &accesses = a_.variable_accesses[v]; if (accesses.empty()) { if (config_.check_unused_variables) { - // Before we throw an error, we want to check that it isn't - // a case that can be produced by the ExtendMatrices() - // optimization, that is actually allowed. This is a case - // when a variable is the last few rows of a matrix, but - // not all columns of those last rows. 
NnetComputation::SubMatrixInfo info = a_.variables.VariableInfo(v); const NnetComputation::MatrixInfo &matrix_info = computation_.matrices[info.matrix_index]; - if (info.row_offset > 0 && - info.num_rows + info.row_offset == matrix_info.num_rows && + // Before we throw an error, we want to check that it isn't a case that + // can be produced by the ExtendMatrices() optimization, that is + // actually allowed. This is a case when a variable is inside the last + // few rows of a matrix, but not all columns of those last rows. + if (info.row_offset >= min_proportion * matrix_info.num_rows && !(info.col_offset == 0 && info.num_cols == matrix_info.num_cols)) { continue; } diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 19ca31cf955..c53fba815fb 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -1064,6 +1064,9 @@ class MatrixExtender { std::vector is_input_or_output_; }; +// note: the initializer for min_proportion_ below needs to be kept in sync with +// the min_proportion variable in +// ComputationChecker::CheckComputationUndefined() in nnet-analyze.cc. 
MatrixExtender::MatrixExtender(NnetComputation *computation): min_proportion_(0.8), computation_(computation) { @@ -1111,9 +1114,10 @@ bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index, const MatrixInfo &src_matrix = computation_->matrices[src_submatrix.matrix_index]; - int32 dest_matrix_num_rows = orig_num_rows_[dest_submatrix.matrix_index]; + int32 dest_matrix_orig_num_rows = orig_num_rows_[dest_submatrix.matrix_index], + src_matrix_orig_num_rows = orig_num_rows_[src_submatrix.matrix_index]; - if (src_submatrix.num_rows < min_proportion_ * src_matrix.num_rows) + if (src_submatrix.num_rows < min_proportion_ * src_matrix_orig_num_rows) return false; // The following checks that the source submatrix covers be all of the @@ -1124,7 +1128,7 @@ bool MatrixExtender::CanBeExtended(int32 dest_submatrix_index, src_submatrix.row_offset == 0 && src_submatrix.num_rows < src_matrix.num_rows && dest_submatrix.row_offset + dest_submatrix.num_rows == - dest_matrix_num_rows); + dest_matrix_orig_num_rows); } @@ -4614,9 +4618,11 @@ void OptimizeMemoryCompression(const Nnet &nnet, if (GetVerboseLevel() >= 2) { bytes_used_final = GetMaxMemoryUse(*computation); - KALDI_VLOG(2) << "Memory compression reduced memory use from " - << bytes_used_initial << " to " - << bytes_used_final << " bytes."; + if (bytes_used_final != bytes_used_initial) { + KALDI_VLOG(2) << "Memory compression reduced memory use from " + << bytes_used_initial << " to " + << bytes_used_final << " bytes."; + } } } } diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index cf308dd3b00..31872e46b72 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -75,7 +75,7 @@ struct NnetOptimizeOptions { max_deriv_time(std::numeric_limits::max()), max_deriv_time_relative(std::numeric_limits::max()), snip_row_ops(true), - memory_compression_level(0), + memory_compression_level(1), optimize_looped_computation(false) { } void Register(OptionsItf *opts) { @@ -133,10 +133,11 @@ struct 
NnetOptimizeOptions { "per-row operations"); opts->Register("memory-compression-level", &memory_compression_level, "This is only relevant to training, not decoding. Set this " - "to 0,1,2,3; higher levels are more aggressive at reducing " + "to 0,1,2; higher levels are more aggressive at reducing " "memory by compressing quantities needed for backprop, " "potentially at the expense of speed and the accuracy " - "of derivatives. 0 means no compression at all."); + "of derivatives. 0 means no compression at all; 1 means " + "compression that shouldn't affect results at all."); } void Read(std::istream &is, bool binary); From fe18a16e3540e1530900b7d4382bf4423fa949aa Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 29 Jan 2018 23:53:09 -0500 Subject: [PATCH 104/184] [egs] Add new example script --- .../s5c/local/chain/tuning/run_tdnn_7m23t.sh | 542 ++++++++++++++++++ 1 file changed, 542 insertions(+) create mode 100755 egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23t.sh diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23t.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23t.sh new file mode 100755 index 00000000000..f912b2d1175 --- /dev/null +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7m23t.sh @@ -0,0 +1,542 @@ +#!/bin/bash + +# 7m23t is as 7m23r but with 1280 instead of 1536 as the dim. +# Differernce vs. 23r is unclear (maybe slightly worse), but it +# seems slightly better than 23h, and it's nice that it has fewer parameters. 
+ + + +# local/chain/compare_wer_general.sh --rt03 tdnn7m23h_sp tdnn7m23r_sp tdnn7m23t_sp +# System tdnn7m23h_sp tdnn7m23r_sp tdnn7m23t_sp +# WER on train_dev(tg) 12.28 11.95 12.18 +# WER on train_dev(fg) 11.21 10.97 11.12 +# WER on eval2000(tg) 15.0 15.0 14.9 +# WER on eval2000(fg) 13.5 13.6 13.5 +# WER on rt03(tg) 18.5 18.4 18.4 +# WER on rt03(fg) 16.1 15.9 16.2 +# Final train prob -0.083 -0.076 -0.077 +# Final valid prob -0.097 -0.091 -0.093 +# Final train prob (xent) -1.036 -0.978 -0.994 +# Final valid prob (xent) -1.0629 -1.0026 -1.0194 +# Num-parameters 23513380 23513380 20111396 + +# 7m23r is as 7m23h but with 6 epochs instead of 4. See also 7m23p, which +# had 3 epochs. + +# 7m23h is as 7m23b2 but with a small bugfix, removing a stray 'bottleneck-dim=192'. +# Seems slightly better. The comparison below includes our old TDNN+LSTM result +# with dropout, to show that we're doing better than that now. + +# local/chain/compare_wer_general.sh --rt03 tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# System tdnn_lstm_1m_ld5_sp tdnn7m23b2_sp tdnn7m23h_sp +# WER on train_dev(tg) 12.33 12.38 12.28 +# WER on train_dev(fg) 11.42 11.44 11.21 +# WER on eval2000(tg) 15.2 15.1 15.0 +# WER on eval2000(fg) 13.8 13.6 13.5 +# WER on rt03(tg) 18.6 18.4 18.5 +# WER on rt03(fg) 16.3 16.1 16.1 +# Final train prob -0.082 -0.084 -0.083 +# Final valid prob -0.099 -0.098 -0.097 +# Final train prob (xent) -0.959 -1.049 -1.036 +# Final valid prob (xent) -1.0305 -1.0661 -1.0629 +# Num-parameters 39558436 23120164 23513380 +# +# 7m23b2 is as 7m23b but fixing an issue at the last layers. +# 7m23b is as 7m23 but making the splicing more 'symmetric'... doing the +# splicing in 2 stages. Interestingly, objf is not better than 23, but +# WER is slightly better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# System tdnn7m19m_sp tdnn7m23_sp tdnn7m23b2_sp +# WER on train_dev(tg) 12.55 12.23 12.38 +# WER on train_dev(fg) 11.52 11.29 11.44 +# WER on eval2000(tg) 15.2 15.2 15.1 +# WER on eval2000(fg) 13.6 13.7 13.6 +# WER on rt03(tg) 18.6 18.7 18.4 +# WER on rt03(fg) 16.2 16.3 16.1 +# Final train prob -0.089 -0.083 -0.084 +# Final valid prob -0.101 -0.097 -0.098 +# Final train prob (xent) -1.080 -1.025 -1.049 +# Final valid prob (xent) -1.0990 -1.0548 -1.0661 +# Num-parameters 21055012 23120164 23120164 + + +# 7m23 is as 7m19m but removing the bottlenecks from the batchnorm components and +# reducing the dim of the linear components... it's basically an attempt to +# reverse the factorization to have the splicing at a different point. +# + +# 7m19m is as 7m19l but with more skip connections +# Hm-- seems better than 19h. +# +# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# System tdnn7m19h_sp tdnn7m19l_sp tdnn7m19m_sp +# WER on train_dev(tg) 12.61 12.72 12.55 +# WER on train_dev(fg) 11.72 11.62 11.52 +# WER on eval2000(tg) 15.4 15.4 15.2 +# WER on eval2000(fg) 13.7 13.8 13.6 +# WER on rt03(tg) 18.9 18.9 18.6 +# WER on rt03(fg) 16.3 16.4 16.2 +# Final train prob -0.091 -0.091 -0.089 +# Final valid prob -0.102 -0.103 -0.101 +# Final train prob (xent) -1.098 -1.095 -1.080 +# Final valid prob (xent) -1.1031 -1.1191 -1.0990 +# Num-parameters 21055012 20268580 21055012 +# +# 7m19l is as 7m19h but projecting down to an intermediate dim (512) before +# doing the Append... doing this by inserting a linear-component between +# pairs of relu-batchnorm-layers. +# A little worse. 
+# local/chain/compare_wer_general.sh --rt03 tdnn7m19h_sp tdnn7m19l_sp +# System tdnn7m19h_sp tdnn7m19l_sp +# WER on train_dev(tg) 12.65 12.72 +# WER on train_dev(fg) 11.57 11.62 +# WER on eval2000(tg) 15.3 15.4 +# WER on eval2000(fg) 13.7 13.8 +# WER on rt03(tg) 18.8 18.9 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.091 -0.091 +# Final valid prob -0.102 -0.103 +# Final train prob (xent) -1.091 -1.095 +# Final valid prob (xent) -1.1064 -1.1191 +# Num-parameters 21055012 20268580 + + +# 7m19h is as 7m19e but with an extra bypass connection. A bit better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19e_sp tdnn7m19h_sp +# System tdnn7m19e_sp tdnn7m19h_sp +# WER on train_dev(tg) 12.75 12.65 +# WER on train_dev(fg) 11.77 11.57 +# WER on eval2000(tg) 15.5 15.3 +# WER on eval2000(fg) 14.0 13.7 +# WER on rt03(tg) 18.9 18.8 +# WER on rt03(fg) 16.4 16.4 +# Final train prob -0.092 -0.091 +# Final valid prob -0.102 -0.102 +# Final train prob (xent) -1.094 -1.091 +# Final valid prob (xent) -1.1095 -1.1064 +# Num-parameters 20760100 21055012 + +# 7m19e is as 7m19c,d but with dims increased to 1536. Better! 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# System tdnn7m10_sp tdnn7m19c_sp tdnn7m19d_sp tdnn7m19e_sp +# WER on train_dev(tg) 13.77 12.86 13.01 12.75 +# WER on train_dev(fg) 12.65 11.82 12.02 11.77 +# WER on eval2000(tg) 16.1 15.4 15.7 15.5 +# WER on eval2000(fg) 14.3 13.8 14.0 14.0 +# WER on rt03(tg) 19.9 19.1 19.2 18.9 +# WER on rt03(fg) 17.4 16.6 16.7 16.4 +# Final train prob -0.111 -0.094 -0.096 -0.092 +# Final valid prob -0.120 -0.103 -0.105 -0.102 +# Final train prob (xent) -1.314 -1.117 -1.144 -1.094 +# Final valid prob (xent) -1.3247 -1.1223 -1.1478 -1.1095 +# Num-parameters 13361700 17824036 14887972 20760100 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# System tdnn7m16_sp tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp tdnn7m19d_sp +# WER on train_dev(tg) 13.37 13.09 12.93 12.86 13.01 +# WER on train_dev(fg) 12.47 12.12 11.87 11.82 12.02 +# WER on eval2000(tg) 15.8 15.8 15.6 15.4 15.7 +# WER on eval2000(fg) 14.3 14.3 14.0 13.8 14.0 +# WER on rt03(tg) 15.1 14.8 14.9 14.8 14.9 +# WER on rt03(fg) 12.7 12.4 12.5 12.5 12.6 +# Final train prob -0.099 -0.096 -0.096 -0.094 -0.096 +# Final valid prob -0.110 -0.106 -0.106 -0.103 -0.105 +# Final train prob (xent) -1.302 -1.198 -1.188 -1.117 -1.144 +# Final valid prob (xent) -1.3184 -1.2070 -1.1980 -1.1223 -1.1478 +# Num-parameters 14216996 15528996 16512036 17824036 14887972 + +# 7m19c is as 7m19b but with one more layer (and moving the bypass connections up). +# Seems about 0.1% better. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# System tdnn7m19_sp tdnn7m19b_sp tdnn7m19c_sp +# WER on train_dev(tg) 13.09 12.93 12.86 +# WER on train_dev(fg) 12.12 11.87 11.82 +# WER on eval2000(tg) 15.8 15.6 15.4 +# WER on eval2000(fg) 14.3 14.0 13.8 +# WER on rt03(tg) 14.8 14.9 14.8 +# WER on rt03(fg) 12.4 12.5 12.5 +# Final train prob -0.096 -0.096 -0.094 +# Final valid prob -0.106 -0.106 -0.103 +# Final train prob (xent) -1.198 -1.188 -1.117 +# Final valid prob (xent) -1.2070 -1.1980 -1.1223 +# Num-parameters 15528996 16512036 17824036 + +# local/chain/compare_wer_general.sh --rt03 tdnn7m19_sp tdnn7m19b_sp +# System tdnn7m19_sp tdnn7m19b_sp +# WER on train_dev(tg) 13.09 12.93 +# WER on train_dev(fg) 12.12 11.87 +# WER on eval2000(tg) 15.8 15.6 +# WER on eval2000(fg) 14.3 14.0 +# WER on rt03(tg) 14.8 14.9 +# WER on rt03(fg) 12.4 12.5 +# Final train prob -0.096 -0.096 +# Final valid prob -0.106 -0.106 +# Final train prob (xent) -1.198 -1.188 +# Final valid prob (xent) -1.2070 -1.1980 +# Num-parameters 15528996 16512036 + +# 7m19 is as 7m16 but adding an extra -3,0,3 layer. +# CAUTION: messing with queue opts. +# 7m16 is as 7m15 but removing the chain l2-regularize. Does seem better. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# System tdnn7m12_sp tdnn7m15_sp tdnn7m16_sp +# WER on train_dev(tg) 13.58 13.50 13.37 +# WER on train_dev(fg) 12.43 12.44 12.47 +# WER on eval2000(tg) 16.0 16.0 15.8 +# WER on eval2000(fg) 14.3 14.3 14.3 +# WER on rt03(tg) 15.2 15.4 15.1 +# WER on rt03(fg) 13.0 13.0 12.7 +# Final train prob -0.109 -0.111 -0.099 +# Final valid prob -0.117 -0.119 -0.110 +# Final train prob (xent) -1.278 -1.291 -1.302 +# Final valid prob (xent) -1.2880 -1.3036 -1.3184 +# Num-parameters 16089380 14216996 14216996 + +# 7m15 is as 7m12 but reducing the bottleneck dim at the output from +# 384 to 256 (like 11->14). +# 7m12 is as 7m11 but increasing all the TDNN dims from 1024 to 1280. 
+# Seems a little better but could be due to the increase in parameters. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# System tdnn7m8_sp tdnn7m9_sp tdnn7m10_sp tdnn7m11_sp tdnn7m12_sp +# WER on train_dev(tg) 13.60 13.88 13.77 13.83 13.58 +# WER on train_dev(fg) 12.62 12.64 12.65 12.65 12.43 +# WER on eval2000(tg) 16.8 16.1 16.1 16.1 16.0 +# WER on eval2000(fg) 15.4 14.4 14.3 14.5 14.3 +# WER on rt03(tg) 16.2 15.5 15.6 15.3 15.2 +# WER on rt03(fg) 13.7 13.1 13.2 13.0 13.0 +# Final train prob -0.105 -0.111 -0.111 -0.109 -0.109 +# Final valid prob -0.115 -0.119 -0.120 -0.118 -0.117 +# Final train prob (xent) -1.282 -1.309 -1.314 -1.292 -1.278 +# Final valid prob (xent) -1.3194 -1.3246 -1.3247 -1.3077 -1.2880 +# Num-parameters 11580452 13818148 13361700 13809188 16089380 + +# 7m11 is as 7m10 but increasing the TDNN dims and reducing the bottlenecks. +# 7m10 is as 7m9 but reducing the bottleneck-dims for the non-splicing TDNN layers. +# 7m9 is as 7m8 but adding bottleneck-dims, and increasing the TDNN dims. + +# local/chain/compare_wer_general.sh --rt03 tdnn7m8_sp tdnn7m9_sp +# System tdnn7m8_sp tdnn7m9_sp +# WER on train_dev(tg) 13.60 13.88 +# WER on train_dev(fg) 12.62 12.64 +# WER on eval2000(tg) 16.8 16.1 +# WER on eval2000(fg) 15.4 14.4 +# WER on rt03(tg) 16.2 15.5 +# WER on rt03(fg) 13.7 13.1 +# Final train prob -0.105 -0.111 +# Final valid prob -0.115 -0.119 +# Final train prob (xent) -1.282 -1.309 +# Final valid prob (xent) -1.3194 -1.3246 +# Num-parameters 11580452 13818148 + +# 7m8 is as 7m5b but double the l2-regularization for the TDNN layers, which +# is the same as 7m2->7m3, which was helpful there. +# Does seem helpful. 
+ +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5b_sp tdnn7m8_sp +# WER on train_dev(tg) 13.70 13.74 13.81 13.60 +# WER on train_dev(fg) 12.67 12.76 12.74 12.62 +# WER on eval2000(tg) 16.6 17.1 17.0 16.8 +# WER on eval2000(fg) 15.1 15.4 15.4 15.4 +# WER on rt03(tg) 16.1 16.2 16.0 16.2 +# WER on rt03(fg) 13.7 13.8 13.6 13.7 +# Final train prob -0.085 -0.106 -0.104 -0.105 +# Final valid prob -0.103 -0.118 -0.116 -0.115 +# Final train prob (xent) -1.230 -1.296 -1.285 -1.282 +# Final valid prob (xent) -1.2704 -1.3318 -1.3283 -1.3194 +# Num-parameters 16292693 10924836 11580452 11580452 + + +# 7m5b is as 7m5 but rducing the prefinal layer dims to previous values. +# WER changes (+ is worse): +1 +1 +2 +3 -2 -2... so maybe worse on average, +# but not clear at all... for consistency with other setups I may retain +# this change. + +# local/chain/compare_wer_general.sh --rt03 tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# System tdnn_7m_sp tdnn_7m2_sp tdnn7m5_sp tdnn7m5b_sp +# WER on train_dev(tg) 13.70 13.74 13.71 13.81 +# WER on train_dev(fg) 12.67 12.76 12.64 12.74 +# WER on eval2000(tg) 16.6 17.1 16.8 17.0 +# WER on eval2000(fg) 15.1 15.4 15.1 15.4 +# WER on rt03(tg) 16.1 16.2 16.2 16.0 +# WER on rt03(fg) 13.7 13.8 13.8 13.6 +# Final train prob -0.085 -0.106 -0.103 -0.104 +# Final valid prob -0.103 -0.118 -0.114 -0.116 +# Final train prob (xent) -1.230 -1.296 -1.274 -1.285 +# Final valid prob (xent) -1.2704 -1.3318 -1.3016 -1.3283 +# Num-parameters 16292693 10924836 12170788 11580452 + + +# 7m5 is as 7m2 but increasing the dimension of the last TDNN layer +# and the prefinal layers from 512 to 768. +# 7m2 is as 7m but with a bunch of tuning changes (model is smaller). +# 7m is as 7k but adding two non-splicing layers towards the beginning of the +# network. 
+# The impovement is pretty small but I've seen similar improvements on other +# setups with this architecture so I tend to believe it. + + +# local/chain/compare_wer_general.sh tdnn_7k_sp tdnn_7m_sp +# System tdnn_7k_sp tdnn_7m_sp +# WER on train_dev(tg) 13.83 13.65 +# WER on train_dev(fg) 12.74 12.54 +# WER on eval2000(tg) 16.9 16.8 +# WER on eval2000(fg) 15.2 15.1 +# Final train prob -0.085 -0.084 +# Final valid prob -0.107 -0.103 +# Final train prob (xent) -1.267 -1.215 +# Final valid prob (xent) -1.3107 -1.2735 + +# steps/info/chain_dir_info.pl exp/chain/tdnn_7m_sp +# exp/chain/tdnn_7m_sp: num-iters=262 nj=3..16 num-params=16.3M dim=40+100->6034 combine=-0.103->-0.103 xent:train/valid[173,261,final]=(-1.28,-1.21,-1.21/-1.32,-1.27,-1.27) logprob:train/valid[173,261,final]=(-0.093,-0.084,-0.084/-0.109,-0.104,-0.103) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +affix=7m23t +suffix= +$speed_perturb && suffix=_sp +if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi + +dir=exp/chain/tdnn${affix}${suffix} +decode_iter= +decode_nj=50 + +# training options +frames_per_eg=150,110,100 +remove_egs=false +common_egs_dir= +xent_regularize=0.1 + +test_online_decoding=false # if true, it will run the last decoding stage. + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. This is the critically different + # step compared with other recipes. 
+ steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 7000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $treedir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.002" + linear_opts="orthonormal-constraint=1.0" + output_opts="l2-regularize=0.0005 bottleneck-dim=256" + + mkdir -p $dir/configs + + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1,ReplaceIndex(ivector, t, 0)) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 $opts dim=1280 + linear-component name=tdnn1l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn2 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn2l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn3 $opts dim=1280 + linear-component name=tdnn3l dim=256 $linear_opts input=Append(-1,0) + relu-batchnorm-layer name=tdnn4 $opts input=Append(0,1) dim=1280 + linear-component name=tdnn4l dim=256 $linear_opts + relu-batchnorm-layer name=tdnn5 $opts dim=1280 input=Append(tdnn4l, tdnn2l) + linear-component name=tdnn5l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn6 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn6l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn7 $opts input=Append(0,3,tdnn5l,tdnn3l,tdnn1l) dim=1280 + linear-component name=tdnn7l dim=256 $linear_opts 
input=Append(-3,0) + relu-batchnorm-layer name=tdnn8 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn8l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn9 $opts input=Append(0,3,tdnn7l,tdnn5l,tdnn3l) dim=1280 + linear-component name=tdnn9l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn10 $opts input=Append(0,3) dim=1280 + linear-component name=tdnn10l dim=256 $linear_opts input=Append(-3,0) + relu-batchnorm-layer name=tdnn11 $opts input=Append(0,3,tdnn9l,tdnn7l,tdnn5l) dim=1280 + linear-component name=tdnn11l dim=256 $linear_opts + + relu-batchnorm-layer name=prefinal-chain input=tdnn11l $opts dim=1280 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + relu-batchnorm-layer name=prefinal-xent input=tdnn11l $opts dim=1280 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 13 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "queue.pl --config /home/dpovey/queue_conly.conf" \ + --feat.online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $frames_per_eg \ + --trainer.num-chunk-per-minibatch 128 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs 6 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs $remove_egs \ + --feat-dir data/${train_set}_hires \ + --tree-dir $treedir \ + --lat-dir exp/tri4_lats_nodup$suffix \ + --dir $dir || exit 1; + +fi + +if [ $stage -le 14 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + + +graph_dir=$dir/graph_sw1_tg +iter_opts= +if [ ! 
-z $decode_iter ]; then + iter_opts=" --iter $decode_iter " +fi +if [ $stage -le 15 ]; then + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $decode_nj --cmd "$decode_cmd" $iter_opts \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $test_online_decoding && [ $stage -le 16 ]; then + # note: if the features change (e.g. you add pitch features), you will have to + # change the options of the following command line. + steps/online/nnet3/prepare_online_decoding.sh \ + --mfcc-config conf/mfcc_hires.conf \ + $lang exp/nnet3/extractor $dir ${dir}_online + + rm $dir/.error 2>/dev/null || true + for decode_set in train_dev eval2000 $maybe_rt03; do + ( + # note: we just give it "$decode_set" as it only uses the wav.scp, the + # feature type does not matter. 
+ + steps/online/nnet3/decode.sh --nj $decode_nj --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + $graph_dir data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_tg || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + ${dir}_online/decode_${decode_set}${decode_iter:+_$decode_iter}_sw1_{tg,fsh_fg} || exit 1; + fi + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + + +exit 0; From e14c4b41eb7e40d755e3fd8d4e96aa63183830fb Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 30 Jan 2018 12:04:44 -0500 Subject: [PATCH 105/184] Add phone-set compatibility checks for nnet3 models --- egs/wsj/s5/steps/nnet3/chain/train.py | 4 ++++ egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh | 3 +++ egs/wsj/s5/steps/nnet3/decode.sh | 7 +++++++ egs/wsj/s5/steps/nnet3/train_dnn.py | 10 +++++++--- egs/wsj/s5/steps/nnet3/train_rnn.py | 4 ++++ egs/wsj/s5/steps/nnet3/train_tdnn.sh | 3 +++ .../s5/utils/lang/check_phones_compatible.sh | 19 ++++++++----------- 7 files changed, 36 insertions(+), 14 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6896da67f73..6dcd674bac0 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -274,6 +274,10 @@ def train(args, run_opts): chain_lib.check_for_required_files(args.feat_dir, args.tree_dir, args.lat_dir) + # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will + # need it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir) + # Set some variables. 
num_jobs = common_lib.get_number_of_jobs(args.tree_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 4ba8cae2d56..8eabe9c33e6 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -158,6 +158,9 @@ for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +# Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will +# need it to check compatibility between training and decoding phone-sets. +cp $treedir/phones.txt $dir # Set some variables. nj=`cat $treedir/num_jobs` || exit 1; # number of jobs in alignment dir... diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 50e02629db0..27256ca5964 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -70,6 +70,13 @@ if [ ! -z "$online_ivector_dir" ]; then extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" fi +if [ ! -f $srcdir/phones.txt ]; then + echo >&2 "$0: WARNING: The model directory '$srcdir' does not contain phones.txt." + echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping the" + echo >&2 "$0: WARNING: phone-sets compatible between the trained model and the decoding graph." +fi +utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 + for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do [ ! 
-f $f ] && echo "$0: no such file $f" && exit 1; done diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index 073ad3e7d7a..ccd9c82b622 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -162,6 +162,10 @@ def train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will + # need it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) + # Set some variables. # num_leaves = common_lib.get_number_of_leaves_from_tree(args.ali_dir) num_jobs = common_lib.get_number_of_jobs(args.ali_dir) @@ -365,16 +369,16 @@ def train(args, run_opts): egs_dir=egs_dir, minibatch_size_str=args.minibatch_size, run_opts=run_opts, max_objective_evaluations=args.max_objective_evaluations) - + if args.stage <= num_iters + 1: logger.info("Getting average posterior for purposes of " "adjusting the priors.") - + # If args.do_final_combination is true, we will use the combined model. # Otherwise, we will use the last_numbered model. real_iter = 'combined' if args.do_final_combination else num_iters avg_post_vec_file = train_lib.common.compute_average_posterior( - dir=args.dir, iter=real_iter, + dir=args.dir, iter=real_iter, egs_dir=egs_dir, num_archives=num_archives, prior_subset_size=args.prior_subset_size, run_opts=run_opts) diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 005e751cae0..3e0f03f7de1 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -218,6 +218,10 @@ def train(args, run_opts): arg_string = pprint.pformat(vars(args)) logger.info("Arguments for the experiment\n{0}".format(arg_string)) + # Copy phones.txt from ali-dir to dir. 
Later, steps/nnet3/decode.sh will + # need it to check compatibility between training and decoding phone-sets. + shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) + # Set some variables. num_jobs = common_lib.get_number_of_jobs(args.ali_dir) feat_dim = common_lib.get_feat_dim(args.feat_dir) diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index fbcf426b205..6537c7c659f 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -148,6 +148,9 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/ [ ! -f $f ] && echo "$0: no such file $f" && exit 1; done +# Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will +# need it to check compatibility between training and decoding phone-sets. +cp $alidir/phones.txt $dir # Set some variables. num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 diff --git a/egs/wsj/s5/utils/lang/check_phones_compatible.sh b/egs/wsj/s5/utils/lang/check_phones_compatible.sh index 18301a900c5..cfad06d2b8c 100755 --- a/egs/wsj/s5/utils/lang/check_phones_compatible.sh +++ b/egs/wsj/s5/utils/lang/check_phones_compatible.sh @@ -18,11 +18,8 @@ # except for possible differences in disambiguation symbols (meaning that all # symbols except those beginning with a # are mapped to the same values). # Otherwise it prints a warning and exits with status 1. -# For the sake of compatibility with other scripts that did not write the -# phones.txt to model directories, this script exits silently with status 0 -# if one of the phone symbol tables does not exist. 
-# For the sake of compatibility with other scripts that did not write the -# phones.txt to model directories, this script exits silently with status 0 +# For the sake of compatibility with other scripts that did not write the +# phones.txt to model directories, this script exits silently with status 0 # if one of the phone symbol tables does not exist. . utils/parse_options.sh || exit 1; @@ -36,24 +33,24 @@ fi table_first=$1 table_second=$2 -# check the files exist or not +# check if the files exist or not if [ ! -f $table_first ]; then if [ ! -f $table_second ]; then echo "$0: Error! Both of the two phones-symbol tables are absent." echo "Please check your command" exit 1; else - #The phones-symbol-table1 is absent. The model directory maybe created by old script. - #For back compatibility, this script exits silently with status 0. + # The phones-symbol-table1 is absent. The model directory maybe created by old script. + # For back compatibility, this script exits silently with status 0. exit 0; fi elif [ ! -f $table_second ]; then - #The phones-symbol-table2 is absent. The model directory maybe created by old script. - #For back compatibility, this script exits silently with status 0. + # The phones-symbol-table2 is absent. The model directory maybe created by old script. + # For back compatibility, this script exits silently with status 0. exit 0; fi -#Check the two tables are same or not (except for possible difference in disambiguation symbols). +# Check if the two tables are the same (except for possible difference in disambiguation symbols). if ! cmp -s <(grep -v "^#" $table_first) <(grep -v "^#" $table_second); then echo "$0: phone symbol tables $table_first and $table_second are not compatible." 
exit 1; From e4fc87d149e575c1eddc0dcd3412b82c01362d45 Mon Sep 17 00:00:00 2001 From: Karel Vesely Date: Tue, 30 Jan 2018 20:32:48 +0100 Subject: [PATCH 106/184] [scripts] bugfix for 'steps/cleanup/clean_and_segment_data.sh', (#2196) - make sure that lattice-generation, and subsequent search of 'oracle-transcript' in them uses the same data-split. --- egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh index 29d52588807..4a9d43a51b5 100755 --- a/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh +++ b/egs/wsj/s5/steps/cleanup/lattice_oracle_align.sh @@ -72,9 +72,9 @@ fi nj=$(cat $latdir/num_jobs) oov=$(cat $lang/oov.int) -utils/split_data.sh --per-utt $data $nj +utils/split_data.sh $data $nj -sdata=$data/split${nj}utt +sdata=$data/split$nj; if [ $stage -le 1 ]; then $cmd JOB=1:$nj $dir/log/get_oracle.JOB.log \ From 7bfafa46f2d52dec0d12fdad649c69e0a725c4b2 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 30 Jan 2018 18:38:57 -0500 Subject: [PATCH 107/184] Some changes in the previous commit --- egs/wsj/s5/steps/nnet3/chain/train.py | 2 +- egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh | 2 +- egs/wsj/s5/steps/nnet3/decode.sh | 5 ----- egs/wsj/s5/steps/nnet3/train_dnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_rnn.py | 2 +- egs/wsj/s5/steps/nnet3/train_tdnn.sh | 2 +- 6 files changed, 5 insertions(+), 10 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 6dcd674bac0..82ea2771048 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -275,7 +275,7 @@ def train(args, run_opts): args.lat_dir) # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will - # need it to check compatibility between training and decoding phone-sets. 
+ # use it to check compatibility between training and decoding phone-sets. shutil.copy('{0}/phones.txt'.format(args.tree_dir), args.dir) # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh index 8eabe9c33e6..f5340fb4611 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/chain/train_tdnn.sh @@ -159,7 +159,7 @@ for f in $data/feats.scp $treedir/ali.1.gz $treedir/final.mdl $treedir/tree \ done # Copy phones.txt from tree-dir to dir. Later, steps/nnet3/decode.sh will -# need it to check compatibility between training and decoding phone-sets. +# use it to check compatibility between training and decoding phone-sets. cp $treedir/phones.txt $dir # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 27256ca5964..8c520e0b5e1 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -70,11 +70,6 @@ if [ ! -z "$online_ivector_dir" ]; then extra_files="$online_ivector_dir/ivector_online.scp $online_ivector_dir/ivector_period" fi -if [ ! -f $srcdir/phones.txt ]; then - echo >&2 "$0: WARNING: The model directory '$srcdir' does not contain phones.txt." - echo >&2 "$0: WARNING: That means it's you who's reponsible for keeping the" - echo >&2 "$0: WARNING: phone-sets compatible between the trained model and the decoding graph." -fi utils/lang/check_phones_compatible.sh {$srcdir,$graphdir}/phones.txt || exit 1 for f in $graphdir/HCLG.fst $data/feats.scp $model $extra_files; do diff --git a/egs/wsj/s5/steps/nnet3/train_dnn.py b/egs/wsj/s5/steps/nnet3/train_dnn.py index ccd9c82b622..0fe0e4ef445 100755 --- a/egs/wsj/s5/steps/nnet3/train_dnn.py +++ b/egs/wsj/s5/steps/nnet3/train_dnn.py @@ -163,7 +163,7 @@ def train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Copy phones.txt from ali-dir to dir. 
Later, steps/nnet3/decode.sh will - # need it to check compatibility between training and decoding phone-sets. + # use it to check compatibility between training and decoding phone-sets. shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/train_rnn.py b/egs/wsj/s5/steps/nnet3/train_rnn.py index 3e0f03f7de1..78d4eb98d16 100755 --- a/egs/wsj/s5/steps/nnet3/train_rnn.py +++ b/egs/wsj/s5/steps/nnet3/train_rnn.py @@ -219,7 +219,7 @@ def train(args, run_opts): logger.info("Arguments for the experiment\n{0}".format(arg_string)) # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will - # need it to check compatibility between training and decoding phone-sets. + # use it to check compatibility between training and decoding phone-sets. shutil.copy('{0}/phones.txt'.format(args.ali_dir), args.dir) # Set some variables. diff --git a/egs/wsj/s5/steps/nnet3/train_tdnn.sh b/egs/wsj/s5/steps/nnet3/train_tdnn.sh index 6537c7c659f..f023d38b26c 100755 --- a/egs/wsj/s5/steps/nnet3/train_tdnn.sh +++ b/egs/wsj/s5/steps/nnet3/train_tdnn.sh @@ -149,7 +149,7 @@ for f in $data/feats.scp $lang/L.fst $alidir/ali.1.gz $alidir/final.mdl $alidir/ done # Copy phones.txt from ali-dir to dir. Later, steps/nnet3/decode.sh will -# need it to check compatibility between training and decoding phone-sets. +# use it to check compatibility between training and decoding phone-sets. cp $alidir/phones.txt $dir # Set some variables. 
From 1647856ee45282bf0e20dbd49c7505a9a8c36d4b Mon Sep 17 00:00:00 2001 From: Pavel Denisov Date: Wed, 31 Jan 2018 23:25:12 +0100 Subject: [PATCH 108/184] [egs] Small bug-fix in Librispeech recipe (#2190) --- egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh index 8546955c93c..cd26773f50f 100755 --- a/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/librispeech/s5/local/chain/tuning/run_tdnn_1b.sh @@ -173,8 +173,6 @@ if [ $stage -le 15 ]; then /export/b0{5,6,7,8}/$USER/kaldi-data/egs/librispeech-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage fi - touch $dir/egs/.nodelete # keep egs around when that run dies. - steps/nnet3/chain/train.py --stage $train_stage \ --cmd "$decode_cmd" \ --feat.online-ivector-dir $train_ivector_dir \ From 2de3b38666a69774f4896bc96a2b62e3f62c63fe Mon Sep 17 00:00:00 2001 From: Daniel Galvez Date: Wed, 31 Jan 2018 20:22:24 -0800 Subject: [PATCH 109/184] [src] Make arpa2fst robust against ARPA files without . (#2167) --- .gitignore | 1 + src/lm/arpa-lm-compiler-test.cc | 14 ++++++++++++++ src/lm/arpa-lm-compiler.cc | 8 ++++++++ src/lm/arpa-lm-compiler.h | 1 + src/lm/test_data/missing_bos.arpa | 18 ++++++++++++++++++ 5 files changed, 42 insertions(+) create mode 100644 src/lm/test_data/missing_bos.arpa diff --git a/.gitignore b/.gitignore index 940a571d2ca..bd6410c4aab 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Compiled extensionless executable files in /src/*/ # This stanza must precede wildcard patterns below! 
/src/*/* +!/src/lm/test_data/ !/src/*/?*.* !/src/doc/* !/src/*/Makefile diff --git a/src/lm/arpa-lm-compiler-test.cc b/src/lm/arpa-lm-compiler-test.cc index 21239afb475..697d70c416a 100644 --- a/src/lm/arpa-lm-compiler-test.cc +++ b/src/lm/arpa-lm-compiler-test.cc @@ -204,6 +204,17 @@ bool ScoringTest(bool seps, const string &infile, const string& sentence, return ok; } +bool ThrowsExceptionTest(bool seps, const string &infile) { + try { + // Make memory cleanup easy in both cases of try-catch block. + std::unique_ptr compiler(Compile(seps, infile)); + return false; + } catch (const std::runtime_error&) { + // Kaldi throws only std::runtime_error in kaldi-error.cc + return true; + } +} + } // namespace kaldi bool RunAllTests(bool seps) { @@ -214,6 +225,9 @@ bool RunAllTests(bool seps) { ok &= kaldi::ScoringTest(seps, "test_data/input.arpa", "b b b a", 59.2649); ok &= kaldi::ScoringTest(seps, "test_data/input.arpa", "a b", 4.36082); + + ok &= kaldi::ThrowsExceptionTest(seps, "test_data/missing_bos.arpa"); + if (!ok) { KALDI_WARN << "Tests " << (seps ? 
"with" : "without") << " epsilon substitution FAILED"; diff --git a/src/lm/arpa-lm-compiler.cc b/src/lm/arpa-lm-compiler.cc index c854b077d00..d774deeb783 100644 --- a/src/lm/arpa-lm-compiler.cc +++ b/src/lm/arpa-lm-compiler.cc @@ -360,10 +360,18 @@ void ArpaLmCompiler::RemoveRedundantStates() { << fst_.NumStates(); } +void ArpaLmCompiler::Check() const { + if (fst_.Start() == fst::kNoStateId) { + KALDI_ERR << "Arpa file did not contain the beginning-of-sentence symbol " + << Symbols()->Find(Options().bos_symbol) << "."; + } +} + void ArpaLmCompiler::ReadComplete() { fst_.SetInputSymbols(Symbols()); fst_.SetOutputSymbols(Symbols()); RemoveRedundantStates(); + Check(); } } // namespace kaldi diff --git a/src/lm/arpa-lm-compiler.h b/src/lm/arpa-lm-compiler.h index 35fd52d6cf3..3e3baeb6ee1 100644 --- a/src/lm/arpa-lm-compiler.h +++ b/src/lm/arpa-lm-compiler.h @@ -52,6 +52,7 @@ class ArpaLmCompiler : public ArpaFileParser { // this function removes states that only have a backoff arc coming // out of them. void RemoveRedundantStates(); + void Check() const; int sub_eps_; ArpaLmCompilerImplInterface* impl_; // Owned. 
diff --git a/src/lm/test_data/missing_bos.arpa b/src/lm/test_data/missing_bos.arpa new file mode 100644 index 00000000000..487061a49a4 --- /dev/null +++ b/src/lm/test_data/missing_bos.arpa @@ -0,0 +1,18 @@ + +\data\ +ngram 1=3 +ngram 2=1 +ngram 3=1 + +\1-grams: +-5.234679 a -3.3 +-3.456783 b -3.0 +-4.333333 + +\2-grams: +-1.45678 a b -3.23 + +\3-grams: +-0.23940 a b + +\end\ From c82560ddb2add09878c20df6d4b04c7f23f8c010 Mon Sep 17 00:00:00 2001 From: Matthew Maciejewski Date: Thu, 1 Feb 2018 18:11:44 -0500 Subject: [PATCH 110/184] [scripts] Fixed small issue get_uniform_subsegments.py (RE rounding) (#2200) --- egs/wsj/s5/utils/data/get_uniform_subsegments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/utils/data/get_uniform_subsegments.py b/egs/wsj/s5/utils/data/get_uniform_subsegments.py index adf042f3d4d..c61b96e0dbb 100755 --- a/egs/wsj/s5/utils/data/get_uniform_subsegments.py +++ b/egs/wsj/s5/utils/data/get_uniform_subsegments.py @@ -87,8 +87,8 @@ def run(args): else: end = end_time new_utt = "{utt_id}-{s:08d}-{e:08d}".format( - utt_id=utt_id, s=int(100 * (start - start_time)), - e=int(100 * (end - start_time))) + utt_id=utt_id, s=int(round(100 * (start - start_time))), + e=int(round(100 * (end - start_time)))) print ("{new_utt} {utt_id} {s} {e}".format( new_utt=new_utt, utt_id=utt_id, s=start - start_time, e=end - start_time)) From b4fbe00b0b827d7b9f914ab6b82b29a903a16144 Mon Sep 17 00:00:00 2001 From: Yiming Wang Date: Fri, 2 Feb 2018 22:48:53 -0500 Subject: [PATCH 111/184] [egs] Add assert to check --backstitch-training-interval option (#2203) (#2204) --- src/nnet3/nnet-chain-training.cc | 3 ++- src/nnet3/nnet-training.cc | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 780a7115a8a..3e6d8599382 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -37,7 +37,8 @@ 
NnetChainTrainer::NnetChainTrainer(const NnetChainTrainingOptions &opts, if (opts.nnet_config.zero_component_stats) ZeroComponentStats(nnet); KALDI_ASSERT(opts.nnet_config.momentum >= 0.0 && - opts.nnet_config.max_param_change >= 0.0); + opts.nnet_config.max_param_change >= 0.0 && + opts.nnet_config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 30cd47b3eb2..a9093523222 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -34,7 +34,8 @@ NnetTrainer::NnetTrainer(const NnetTrainerOptions &config, if (config.zero_component_stats) ZeroComponentStats(nnet); KALDI_ASSERT(config.momentum >= 0.0 && - config.max_param_change >= 0.0); + config.max_param_change >= 0.0 && + config.backstitch_training_interval > 0); delta_nnet_ = nnet_->Copy(); ScaleNnet(0.0, delta_nnet_); const int32 num_updatable = NumUpdatableComponents(*delta_nnet_); From 9e2d8442bfee9fca88c1fad4f138fcaef0ac1e3f Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 4 Feb 2018 05:22:04 +0330 Subject: [PATCH 112/184] [egs,scripts] Fix and simplify speed-perturbation scripts; fix permissions. 
(#2205) --- .../s5/local/chain/compare_wer_general.sh | 0 .../s5/local/chain/run_blstm_6h.sh | 0 .../s5/local/chain/run_blstm_6j.sh | 0 egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh | 0 .../s5/local/chain/run_tdnn_lstm_1a.sh | 0 .../s5/local/chain/run_tdnn_lstm_1b.sh | 0 .../s5/local/chain/run_tdnn_opgru_1a.sh | 0 .../s5/local/chain/run_tdnn_opgru_1b.sh | 0 .../s5c/local/nnet3/run_ivector_common.sh | 45 +++++++------------ .../utils/data/perturb_data_dir_speed_3way.sh | 12 ++++- 10 files changed, 27 insertions(+), 30 deletions(-) mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/compare_wer_general.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh mode change 100644 => 100755 egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh diff --git a/egs/fisher_swbd/s5/local/chain/compare_wer_general.sh b/egs/fisher_swbd/s5/local/chain/compare_wer_general.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6h.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh b/egs/fisher_swbd/s5/local/chain/run_blstm_6j.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_7b.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1a.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh 
b/egs/fisher_swbd/s5/local/chain/run_tdnn_lstm_1b.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1a.sh old mode 100644 new mode 100755 diff --git a/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh b/egs/fisher_swbd/s5/local/chain/run_tdnn_opgru_1b.sh old mode 100644 new mode 100755 diff --git a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh index 5e132830cfb..d45095ec85b 100755 --- a/egs/swbd/s5c/local/nnet3/run_ivector_common.sh +++ b/egs/swbd/s5c/local/nnet3/run_ivector_common.sh @@ -4,49 +4,38 @@ set -e stage=1 train_stage=-10 -generate_alignments=true # false if doing ctc training +generate_alignments=true speed_perturb=true . ./path.sh . ./utils/parse_options.sh -mkdir -p nnet3 -# perturbed data preparation +mkdir -p exp/nnet3 train_set=train_nodup if [ -e data/rt03 ]; then maybe_rt03=rt03; else maybe_rt03= ; fi -if [ "$speed_perturb" == "true" ]; then +if $speed_perturb; then if [ $stage -le 1 ]; then - #Although the nnet will be trained by high resolution data, we still have to perturbe the normal data to get the alignment + # Although the nnet will be trained by high resolution data, we still have to perturb the normal data to get the alignments # _sp stands for speed-perturbed - - for datadir in train_nodup; do - utils/perturb_data_dir_speed.sh 0.9 data/${datadir} data/temp1 - utils/perturb_data_dir_speed.sh 1.1 data/${datadir} data/temp2 - utils/combine_data.sh data/${datadir}_tmp data/temp1 data/temp2 - utils/validate_data_dir.sh --no-feats data/${datadir}_tmp - rm -r data/temp1 data/temp2 - - mfccdir=mfcc_perturbed - steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ - data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - steps/compute_cmvn_stats.sh data/${datadir}_tmp exp/make_mfcc/${datadir}_tmp $mfccdir || exit 1; - utils/fix_data_dir.sh data/${datadir}_tmp - - utils/copy_data_dir.sh 
--spk-prefix sp1.0- --utt-prefix sp1.0- data/${datadir} data/temp0 - utils/combine_data.sh data/${datadir}_sp data/${datadir}_tmp data/temp0 - utils/fix_data_dir.sh data/${datadir}_sp - rm -r data/temp0 data/${datadir}_tmp - done + echo "$0: preparing directory for speed-perturbed data" + utils/data/perturb_data_dir_speed_3way.sh data/${train_set} data/${train_set}_sp + + echo "$0: creating MFCC features for low-resolution speed-perturbed data" + mfccdir=mfcc_perturbed + steps/make_mfcc.sh --cmd "$train_cmd" --nj 50 \ + data/${train_set}_sp exp/make_mfcc/${train_set}_sp $mfccdir + steps/compute_cmvn_stats.sh data/${train_set}_sp exp/make_mfcc/${train_set}_sp $mfccdir + utils/fix_data_dir.sh data/${train_set}_sp fi - if [ $stage -le 2 ] && [ "$generate_alignments" == "true" ]; then - #obtain the alignment of the perturbed data + if [ $stage -le 2 ] && $generate_alignments; then + # obtain the alignment of the perturbed data steps/align_fmllr.sh --nj 100 --cmd "$train_cmd" \ - data/train_nodup_sp data/lang exp/tri4 exp/tri4_ali_nodup_sp || exit 1 + data/${train_set}_sp data/lang exp/tri4 exp/tri4_ali_nodup_sp fi - train_set=train_nodup_sp + train_set=${train_set}_sp fi if [ $stage -le 3 ]; then diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh index 5b007cadb3f..048220d62fd 100755 --- a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh +++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh @@ -38,9 +38,17 @@ utils/data/get_utt2dur.sh ${srcdir} utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1 utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1 -utils/data/combine_data.sh $destdir ${srcdir} ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1 -rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 +utils/copy_data_dir.sh --spk-prefix sp1.0- --utt-prefix sp1.0- ${srcdir} ${destdir}_speed1.0 +if [ ! 
-f $srcdir/utt2uniq ]; then + cat $srcdir/utt2spk | awk '{printf("sp1.0-%s %s\n", $1, $1);}' > ${destdir}_speed1.0/utt2uniq +else + cat $srcdir/utt2uniq | awk '{printf("sp1.0-%s %s\n", $1, $2);}' > ${destdir}_speed1.0/utt2uniq +fi + +utils/data/combine_data.sh $destdir ${destdir}_speed1.0 ${destdir}_speed0.9 ${destdir}_speed1.1 || exit 1 + +rm -r ${destdir}_speed0.9 ${destdir}_speed1.1 ${destdir}_speed1.0 echo "$0: generated 3-way speed-perturbed version of data in $srcdir, in $destdir" utils/validate_data_dir.sh --no-feats --no-text $destdir From 79065901b7d4a58d757dcba4fcdac89a374caff2 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 6 Feb 2018 16:29:56 -0500 Subject: [PATCH 113/184] [doc] remove outdated TODOs. Thx: David Van Leeuwen. --- src/doc/chain.dox | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/src/doc/chain.dox b/src/doc/chain.dox index 687d9a2d7a7..ca2efce3627 100644 --- a/src/doc/chain.dox +++ b/src/doc/chain.dox @@ -389,34 +389,7 @@ on the paths. You might notice in the current example scripts that we use iVectors. We do so just because they generally help a bit, and because the baseline setup we were comparing with, uses them. There is no inherent connection with 'chain' - models, and no fundamental requirement to use them. Actually we want to get rid - of them (see below). - - - \section chain_next_steps Next steps (TODOs) with 'chain' models - - (Note: this list is valid as of Dec 13 2015, but may become out of date). - Things we need to do (and that we'd like help with) are: - - Supply example scripts (and tune them) on a wide range of corpora - (It will be interesting to see whether there are scale-dependent effects - affecting how well this model works). - - Create and tune LSTM and BLSTM versions of the training script. (This - may involve some playing around with learning rate schedules and - configurations). - - Figure out how to speed up the forward-backward part of the computation. 
- (E.g. using state-level pruning, or just by optimizing the current kernels or - data structures). - - A longer-term TODO, which Dan should do, is to create an online decoding setup - for these models. Actually this isn't really distinct from nnet3 online - decoding in general, since the models are no different from regular nnet3 - acoustic models. But we do have to decide whether to continue to support - iVectors-- getting rid of them would simplify the setup considerably, and - would hopefully make it more robust. We are hoping that with LSTMs, since it - already sees quite a wide acoustic context, iVector adaptation will no longer - be as helpful and could be dropped. We also have other ideas how to - incorporate adaptation as part of the neural network, without the use of - iVectors. This will require some experimentation. + models, and no fundamental requirement to use them. */ From 027f1d79611395183af184d50e1dc82552480203 Mon Sep 17 00:00:00 2001 From: Kuang R Date: Wed, 7 Feb 2018 14:49:55 +0800 Subject: [PATCH 114/184] [scripts] Add missing import statement in script (#2207) --- egs/wsj/s5/steps/libs/common.py | 1 - 1 file changed, 1 deletion(-) diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index b84fa46f622..1e8e2ced6ce 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -14,7 +14,6 @@ import logging import math import os -import re import subprocess import sys import threading From 8e170e039c102a7b52cc4047296c21b6f3055227 Mon Sep 17 00:00:00 2001 From: Xiaohui Zhang Date: Wed, 7 Feb 2018 16:31:17 -0500 Subject: [PATCH 115/184] =?UTF-8?q?[egs]=20multi=5Fen:=20Fixed=20acronym?= =?UTF-8?q?=20normalization,=20swbd=20lexicon=20preparation,=20OOV=20?= =?UTF-8?q?=E2=80=A6=20(#2137)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * multi_en: Fixed acronym normalization, swbd lexicon preparation, OOV pronunciation generation, acoustic data 
sub-sampling,.etc; Added hub4_97 data --- egs/multi_en/s5/README.md | 2 +- egs/multi_en/s5/RESULTS | 39 +-- egs/multi_en/s5/conf/mfcc.conf | 1 + egs/multi_en/s5/local/g2p/apply_g2p.sh | 2 +- egs/multi_en/s5/local/hub4_96_data_prep.sh | 52 ++++ egs/multi_en/s5/local/hub4_96_parse_sgm.pl | 235 +++++++++++++++++ egs/multi_en/s5/local/hub4_97_data_prep.sh | 50 ++++ egs/multi_en/s5/local/hub4_97_parse_sgm.pl | 235 +++++++++++++++++ egs/multi_en/s5/local/hub4_data_prep.py | 242 ------------------ egs/multi_en/s5/local/hub4_en_data_prep.sh | 62 +++++ egs/multi_en/s5/local/hub4_format_data.pl | 138 ++++++++++ .../local/hub4_normalize_bn96_transcripts.pl | 33 +++ .../local/hub4_normalize_bn97_transcripts.pl | 42 +++ egs/multi_en/s5/local/hub4_utils.py | 174 ------------- .../s5/local/librispeech_data_prep.sh | 13 +- egs/multi_en/s5/local/make_partitions.sh | 9 +- egs/multi_en/s5/local/swbd1_data_prep.sh | 10 +- egs/multi_en/s5/local/tedlium_prepare_data.sh | 3 +- egs/multi_en/s5/local/wsj_data_prep.sh | 3 +- egs/multi_en/s5/run.sh | 117 ++++----- egs/swbd/s5c/local/score_basic.sh | 3 +- egs/wsj/s5/utils/data/resample_data_dir.sh | 7 + 22 files changed, 943 insertions(+), 529 deletions(-) mode change 100644 => 100755 egs/multi_en/s5/RESULTS create mode 100755 egs/multi_en/s5/local/hub4_96_data_prep.sh create mode 100755 egs/multi_en/s5/local/hub4_96_parse_sgm.pl create mode 100755 egs/multi_en/s5/local/hub4_97_data_prep.sh create mode 100755 egs/multi_en/s5/local/hub4_97_parse_sgm.pl delete mode 100755 egs/multi_en/s5/local/hub4_data_prep.py create mode 100755 egs/multi_en/s5/local/hub4_en_data_prep.sh create mode 100755 egs/multi_en/s5/local/hub4_format_data.pl create mode 100755 egs/multi_en/s5/local/hub4_normalize_bn96_transcripts.pl create mode 100755 egs/multi_en/s5/local/hub4_normalize_bn97_transcripts.pl delete mode 100644 egs/multi_en/s5/local/hub4_utils.py diff --git a/egs/multi_en/s5/README.md b/egs/multi_en/s5/README.md index 0affcb9cf08..20505c5af6f 100755 --- 
a/egs/multi_en/s5/README.md +++ b/egs/multi_en/s5/README.md @@ -2,7 +2,7 @@ This is a WIP **English LVCSR recipe** that trains on data from multiple corpora * Fisher (1761 hours) * Switchboard (317 hours) * WSJ (81 hours) -* HUB4 English Broadcast News (76 hours) +* HUB4 (1996 & 1997) English Broadcast News (75 + 72 hours) * TED-LIUM (118 hours) * Librispeech (960 hours) diff --git a/egs/multi_en/s5/RESULTS b/egs/multi_en/s5/RESULTS old mode 100644 new mode 100755 index 24b82755b94..17eb49c3740 --- a/egs/multi_en/s5/RESULTS +++ b/egs/multi_en/s5/RESULTS @@ -36,21 +36,24 @@ exit 0 # multi_a tri5 tedlium_tg_tedlium.si || %WER 29.0 | 1155 27512 | 75.8 20.3 3.9 4.8 29.0 93.3 | exp/multi_a/tri5/decode_tedlium_tg_tedlium.si/score_11_0.5/test.ctm.filt.sys # Results with the current data combination, lexicon preparation, and acoustic model training procedures. -# On eval2000 the final GMM results is 24.3, which is better than the above result (24.9). - -multi_a tri1b tg_eval2000 || %WER 40.3 | 4459 42989 | 63.7 26.1 10.2 4.0 40.3 72.9 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys -multi_a tri1b tg_eval2000.si || %WER 45.3 | 4459 42989 | 59.2 29.3 11.4 4.6 45.3 75.4 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys -multi_a tri3a tg_eval2000 || %WER 33.3 | 4459 42989 | 70.4 21.0 8.6 3.7 33.3 69.6 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_1.0/eval2000.ctm.filt.sys -multi_a tri3a tg_eval2000.si || %WER 38.5 | 4459 42989 | 65.9 24.7 9.5 4.4 38.5 72.5 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys -multi_a tri3b tg_eval2000 || %WER 27.9 | 4459 42989 | 75.8 17.9 6.3 3.7 27.9 67.1 | exp/multi_a/tri3b/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys -multi_a tri3b tg_eval2000.si || %WER 31.6 | 4459 42989 | 71.9 20.3 7.8 3.5 31.6 68.8 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys -multi_a tri3b tg_sp_eval2000 || %WER 26.7 | 4459 42989 | 77.2 17.1 5.7 3.9 
26.7 65.6 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_0.0/eval2000.ctm.filt.sys -multi_a tri3b tg_sp_eval2000.si || %WER 30.6 | 4459 42989 | 73.1 19.6 7.3 3.8 30.6 68.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_12_1.0/eval2000.ctm.filt.sys -multi_a tri4 tg_eval2000 || %WER 24.8 | 4459 42989 | 78.5 16.0 5.5 3.4 24.8 63.8 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys -multi_a tri4 tg_eval2000.si || %WER 31.2 | 4459 42989 | 72.6 20.6 6.8 3.9 31.2 67.6 | exp/multi_a/tri4/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys -multi_a tri5a tg_eval2000 || %WER 24.3 | 4459 42989 | 78.8 15.6 5.6 3.2 24.3 63.3 | exp/multi_a/tri5a/decode_tg_eval2000/score_13_0.0/eval2000.ctm.filt.sys -multi_a tri5a tg_eval2000.si || %WER 30.6 | 4459 42989 | 73.7 20.2 6.1 4.3 30.6 67.9 | exp/multi_a/tri5a/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys -multi_a tri5a tg_sp_eval2000 || %WER 24.2 | 4459 42989 | 79.1 15.6 5.3 3.3 24.2 63.2 | exp/multi_a/tri5a/decode_tg_sp_eval2000/score_12_0.0/eval2000.ctm.filt.sys -multi_a tri5a tg_sp_eval2000.si || %WER 30.5 | 4459 42989 | 73.7 20.3 6.0 4.2 30.5 67.8 | exp/multi_a/tri5a/decode_tg_sp_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys -multi_a tri5b tg_eval2000 || %WER 24.3 | 4459 42989 | 79.3 15.7 5.0 3.6 24.3 63.5 | exp/multi_a/tri5b/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys -multi_a tri5b tg_eval2000.si || %WER 30.7 | 4459 42989 | 73.6 20.4 6.0 4.3 30.7 68.1 | exp/multi_a/tri5b/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys +# On eval2000 the final GMM results is 24.5, which is better than the above result (24.9). 
+multi_a tri1b tg_eval2000 || %WER 40.4 | 4459 42989 | 63.8 25.9 10.3 4.2 40.4 72.7 | exp/multi_a/tri1b/decode_tg_eval2000/score_10_0.5/eval2000.ctm.filt.sys +multi_a tri1b tg_eval2000.si || %WER 45.0 | 4459 42989 | 59.3 28.8 11.9 4.3 45.0 75.0 | exp/multi_a/tri1b/decode_tg_eval2000.si/score_12_0.0/eval2000.ctm.filt.sys +multi_a tri3a tg_eval2000 || %WER 33.4 | 4459 42989 | 70.5 21.3 8.3 3.9 33.4 69.7 | exp/multi_a/tri3a/decode_tg_eval2000/score_11_0.0/eval2000.ctm.filt.sys +multi_a tri3a tg_eval2000.si || %WER 38.4 | 4459 42989 | 66.2 24.2 9.6 4.6 38.4 72.3 | exp/multi_a/tri3a/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys +multi_a tri3b tg_eval2000 || %WER 27.8 | 4459 42989 | 75.7 17.8 6.6 3.5 27.8 66.6 | exp/multi_a/tri3b/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys +multi_a tri3b tg_eval2000.si || %WER 31.7 | 4459 42989 | 71.8 20.3 7.8 3.6 31.7 69.0 | exp/multi_a/tri3b/decode_tg_eval2000.si/score_12_0.5/eval2000.ctm.filt.sys +multi_a tri3b tg_sp_eval2000 || %WER 26.8 | 4459 42989 | 77.0 17.3 5.7 3.8 26.8 65.2 | exp/multi_a/tri3b/decode_tg_sp_eval2000/score_11_1.0/eval2000.ctm.filt.sys +multi_a tri3b tg_sp_eval2000.si || %WER 30.5 | 4459 42989 | 73.7 19.7 6.7 4.2 30.5 68.0 | exp/multi_a/tri3b/decode_tg_sp_eval2000.si/score_11_0.0/eval2000.ctm.filt.sys +multi_a tri4 tg_eval2000 || %WER 24.8 | 4459 42989 | 78.6 15.8 5.5 3.5 24.8 64.1 | exp/multi_a/tri4/decode_tg_eval2000/score_12_1.0/eval2000.ctm.filt.sys +multi_a tri4 tg_eval2000.si || %WER 31.3 | 4459 42989 | 73.1 20.8 6.2 4.4 31.3 68.7 | exp/multi_a/tri4/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys +multi_a tri5a tg_eval2000 || %WER 24.5 | 4459 42989 | 79.0 15.7 5.3 3.5 24.5 63.4 | exp/multi_a/tri5a/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys +multi_a tri5a tg_eval2000.si || %WER 30.4 | 4459 42989 | 73.3 20.0 6.6 3.8 30.4 67.5 | exp/multi_a/tri5a/decode_tg_eval2000.si/score_12_0.5/eval2000.ctm.filt.sys +multi_a tri5a tg_sp_eval2000 || %WER 24.5 | 4459 42989 | 78.9 15.7 
5.4 3.4 24.5 63.4 | exp/multi_a/tri5a/decode_tg_sp_eval2000/score_12_0.5/eval2000.ctm.filt.sys +multi_a tri5a tg_sp_eval2000.si || %WER 30.5 | 4459 42989 | 73.5 20.1 6.5 4.0 30.5 67.8 | exp/multi_a/tri5a/decode_tg_sp_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys +multi_a tri5b tg_eval2000 || %WER 24.4 | 4459 42989 | 79.1 15.6 5.3 3.5 24.4 63.4 | exp/multi_a/tri5b/decode_tg_eval2000/score_12_0.0/eval2000.ctm.filt.sys +multi_a tri5b tg_eval2000.si || %WER 30.5 | 4459 42989 | 73.5 20.2 6.3 4.0 30.5 67.3 | exp/multi_a/tri5b/decode_tg_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys +multi_a tri6a tg_eval2000 || %WER 24.5 | 4459 42989 | 78.8 15.7 5.5 3.4 24.5 63.0 | exp/multi_a/tri6a/decode_tg_eval2000/score_13_0.5/eval2000.ctm.filt.sys +multi_a tri6a tg_eval2000.si || %WER 31.5 | 4459 42989 | 73.1 21.0 5.9 4.6 31.5 68.1 | exp/multi_a/tri6a/decode_tg_eval2000.si/score_10_1.0/eval2000.ctm.filt.sys +multi_a tri6a tg_sp_eval2000 || %WER 24.6 | 4459 42989 | 78.9 15.8 5.3 3.5 24.6 63.3 | exp/multi_a/tri6a/decode_tg_sp_eval2000/score_12_1.0/eval2000.ctm.filt.sys +multi_a tri6a tg_sp_eval2000.si || %WER 31.5 | 4459 42989 | 72.6 21.0 6.4 4.2 31.5 67.9 | exp/multi_a/tri6a/decode_tg_sp_eval2000.si/score_11_1.0/eval2000.ctm.filt.sys diff --git a/egs/multi_en/s5/conf/mfcc.conf b/egs/multi_en/s5/conf/mfcc.conf index 4f780bf520c..9a17e801b3f 100644 --- a/egs/multi_en/s5/conf/mfcc.conf +++ b/egs/multi_en/s5/conf/mfcc.conf @@ -2,3 +2,4 @@ --sample-frequency=8000 --low-freq=20 --high-freq=3700 +--allow-downsample=true diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh index 88b37f21ad8..f8e50302c29 100755 --- a/egs/multi_en/s5/local/g2p/apply_g2p.sh +++ b/egs/multi_en/s5/local/g2p/apply_g2p.sh @@ -33,7 +33,7 @@ cat data/*/train/text | \ perl -ape 's/\s/\n/g;' | \ sort | uniq > $workdir/missing.txt cat $workdir/missing.txt | \ - grep "^[a-z0-9.'_-]*$" > $workdir/missing_onlywords.txt + grep "^[a-z]*$" > $workdir/missing_onlywords.txt echo 
'Synthesizing pronunciations for missing words...' phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt diff --git a/egs/multi_en/s5/local/hub4_96_data_prep.sh b/egs/multi_en/s5/local/hub4_96_data_prep.sh new file mode 100755 index 00000000000..f258ea7b7f5 --- /dev/null +++ b/egs/multi_en/s5/local/hub4_96_data_prep.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +########################################################################################### +# This script was copied from egs/hub4_english/s5/local/data_prep/prepare_1996_bn_data.sh +# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da +# Changes in lower level script/dir names were made +########################################################################################### + +#!/bin/bash +# Copyright (c) 2017, Johns Hopkins University (Jan "Yenda" Trmal) +# 2017 Vimal Manohar +# License: Apache 2.0 + +# This script prepares the 1996 English Broadcast News (HUB4) corpus. +# /export/corpora/LDC/LDC97S44 +# /export/corpora/LDC/LDC97T22 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC97T22/hub4_eng_train_trans /export/corpora/LDC/LDC97S44/data data/local/data/train_bn96" + exit 1 +fi + +text_source_dir=$1 # /export/corpora/LDC/LDC97T22/hub4_eng_train_trans +speech_source_dir=$2 # /export/corpora/LDC/LDC97S44/data +out=$3 + +mkdir -p $out; + +ls $text_source_dir/*/*.txt > $out/text.list +ls $speech_source_dir/*.sph > $out/audio.list + +if [ ! -s $out/text.list ] || [ ! -s $out/audio.list ]; then + echo "$0: Could not get text and audio files" + exit 1 +fi + +local/hub4_96_parse_sgm.pl $out/text.list > \ + $out/transcripts.txt 2> $out/parse_sgml.log || exit 1 + +if [ ! 
-s $out/transcripts.txt ]; then + echo "$0: Could not parse SGML files in $out/text.list" + exit 1 +fi + +echo "$0: 1996 English Broadcast News training data (HUB4) prepared in $out" +exit 0 diff --git a/egs/multi_en/s5/local/hub4_96_parse_sgm.pl b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl new file mode 100755 index 00000000000..172ec5bb563 --- /dev/null +++ b/egs/multi_en/s5/local/hub4_96_parse_sgm.pl @@ -0,0 +1,235 @@ +#!/usr/bin/env perl +########################################################################################### +# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1996_hub4_eng.pl +# The source commit was 9f61a1b0efa76f37fc29fa2dbeede6dc776a0203 +# No change was made +########################################################################################### + +#=============================================================================== +# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal ) +# 2017 Vimal Manohar +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +require HTML::Parser or die "This script needs HTML::Parser from CPAN"; +HTML::Parser->import(); + +binmode(STDOUT, ":utf8"); + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +sub parse_sgml_tag { + my $tag = shift(@_); + my %ret; + + if ($tag !~ /=/) { + return %ret; + } + + $tag =~ s/<[a-zA-Z]+ //; + $tag =~ s/> *$//; + #print $tag . "\n"; + + my @key_value_pairs = split / *,? +/, $tag; + for my $entry(@key_value_pairs) { + (my $key, my $value) = split '=', $entry, 2; + $ret{$key}=$value; + } + return %ret; +} + +if (@ARGV != 1) { + print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing the official SGML format\n"; + print STDERR " transcripts. The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) 
{ + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ / 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = $new_time; + ; + } elsif ($line =~ /<\/sync/) { + #print $line; + ; + } elsif ($line =~ /) +# 2017 Vimal Manohar +# License: Apache 2.0 + +# This script prepares the 1997 English Broadcast News (HUB4) corpus. +# /export/corpora/LDC/LDC98S71 +# /export/corpora/LDC/LDC98T28 + +# Begin configuration section. +# End configuration section +set -e -o pipefail +set -o nounset # Treat unset variables as an error + +if [ $# -ne 3 ]; then + echo "Usage: $0 " + echo " e.g.: $0 /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 data/local/data/train_bn97" + exit 1 +fi + +text_source_dir=$1 # /export/corpora/LDC/LDC98T28/hub4e97_trans_980217 +speech_source_dir=$2 # /export/corpora/LDC/LDC98S71/97_eng_bns_hub4 +out=$3 + +mkdir -p $out; + +ls $text_source_dir/transcrp/*.sgml > $out/text.list +ls $speech_source_dir/*.sph > $out/audio.list + +if [ ! -s $out/text.list ] || [ ! -s $out/audio.list ]; then + echo "$0: Could not get text and audio files" + exit 1 +fi + +local/hub4_97_parse_sgm.pl $out/text.list > \ + $out/transcripts.txt 2> $out/parse_sgml.log || exit 1 + +if [ ! 
-s $out/transcripts.txt ]; then + echo "$0: Could not parse SGML files in $out/text.list" + exit 1 +fi + +echo "$0: 1997 English Broadcast News training data (HUB4) prepared in $out" +exit 0 diff --git a/egs/multi_en/s5/local/hub4_97_parse_sgm.pl b/egs/multi_en/s5/local/hub4_97_parse_sgm.pl new file mode 100755 index 00000000000..da2344df7c7 --- /dev/null +++ b/egs/multi_en/s5/local/hub4_97_parse_sgm.pl @@ -0,0 +1,235 @@ +#!/usr/bin/env perl +########################################################################################### +# This script was copied from egs/hub4_english/s5/local/data_prep/parse_sgm_1997_hub4_eng.pl +# The source commit was 191ae0a6e5db19d316c82a78c746bcd56cc2d7da +# No change was made +########################################################################################### + +#!/usr/bin/env perl +#=============================================================================== +# Copyright (c) 2017 Johns Hopkins University (Author: Jan "Yenda" Trmal ) +# 2017 Vimal Manohar +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. 
+#=============================================================================== + +use strict; +use warnings; +use utf8; + +require HTML::Parser or die "This script needs HTML::Parser from CPAN"; +HTML::Parser->import(); + +binmode(STDOUT, ":utf8"); + +sub trim { my $s = shift; $s =~ s/^\s+|\s+$//g; return $s }; + +sub parse_sgml_tag { + my $tag = shift(@_); + my %ret; + + if ($tag !~ /=/) { + return %ret; + } + + $tag =~ s/<[a-zA-Z]+ //; + $tag =~ s/> *$//; + #print $tag . "\n"; + + my @key_value_pairs = split / *,? +/, $tag; + for my $entry(@key_value_pairs) { + (my $key, my $value) = split '=', $entry, 2; + $ret{$key}=$value; + } + return %ret; +} + +if (@ARGV != 1) { + print STDERR "$0: This script needs exactly one parameter (list of SGML files)\n"; + print STDERR " Usage: $0 \n"; + print STDERR " where\n"; + print STDERR " is a file containing the official SGML format\n"; + print STDERR " transcripts. The files are parsed and the parsed representation\n"; + print STDERR " is dumped to STDOUT (one utterance + the additional data fields\n"; + print STDERR " per line (we dump all the fields, but not all fields are used\n"; + print STDERR " in the recipe).\n"; + die; +} +my $filelist=$ARGV[0]; + +my $p = HTML::Parser->new(); + +my @files=(); +open(F, '<', $filelist) or die "Could not open file $filelist: $?\n"; +while() { + chomp; + push @files, $_; +} + +foreach my $file (@files) { + my $reporter=""; + my $start = -1; + my $end = -1; + my $segment_start = -1; + my $segment_end = -1; + my $segment_speaker; + my $segment_fidelity = "XXX"; + my $segment_mode = "XXX"; + my $section_start = -1; + my $section_end = -1; + my $filename = ""; + my $seq = 0; + my @text = (); + my $time; + my @tagqueue; + + my $sgml_file = `basename $file`; + $sgml_file = trim $sgml_file; + $sgml_file =~ s/\.txt$//g; + $sgml_file =~ s/\.sgml$//g; + $sgml_file =~ s/_$//g; + + open(my $f, '<:encoding(iso-8859-1)', $file) or die "Could not open file $file: $?\n"; + + while(my $line = <$f>) 
{ + chomp $line; + $line = trim $line; + $line = lc $line; + next unless $line; + + if ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ /
/$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + #print "ES: $line\n"; + ; + } elsif ($line =~ //$1/g; + $line = trim $line; + die "Unaligned tags: '" . $p->[0] . "' vs '$line'" if $p->[0] ne $line; + + #print join(" ", @text) . "\n" if @text > 0; + my $new_time = $segment_end; + if (@text > 0) { + print "$sgml_file $filename $segment_speaker $segment_fidelity $segment_mode $time $new_time "; + print join(" ", @text) . "\n"; + } + @text = (); + $time = 0; + $segment_speaker = "XXX"; + $segment_start = "XXX"; + $segment_end = "XXX"; + $segment_fidelity = "XXX"; + $segment_mode = "XXX"; + #print "ET: $line\n"; + ; + } elsif ($line =~ /