From 5fe8b138ad97cfa1f6cdac1d5b64c8ebbef9205e Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 6 Feb 2018 20:28:58 -0500
Subject: [PATCH 01/10] [src] Add a nnet3 optimization that tries to replace
 commands ending in Multi with other commands.

---
 src/nnet3/nnet-computation.h     |  13 +-
 src/nnet3/nnet-optimize-utils.cc | 322 +++++++++++++++++++++++++++++++
 src/nnet3/nnet-optimize-utils.h  |  41 ++--
 src/nnet3/nnet-optimize.cc       |  14 +-
 src/nnet3/nnet-optimize.h        |   6 +
 src/nnet3/nnet-utils.cc          |  12 --
 6 files changed, 379 insertions(+), 29 deletions(-)
diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h
index 0c6c690684a..64b1f4df9aa 100644
--- a/src/nnet3/nnet-computation.h
+++ b/src/nnet3/nnet-computation.h
@@ -395,12 +395,17 @@ struct NnetComputation {
   // These are owned here.
   std::vector<PrecomputedIndexesInfo> component_precomputed_indexes;
 
-  // used in kAddRows, kAddToRows, kCopyRows, kCopyToRows.  contains row-indexes.
+  // Used in commands kAddRows, kAddToRows, kCopyRows, which
+  // contain indexes into this data-member.
+  // Each vector<int32> is a vector of row-indexes (with -1 usually treated as
+  // a special case meaning "don't do anything for this row" for add
+  // commands, or "use zero" for copy commands).
   std::vector<std::vector<int32> > indexes;
 
-  // used kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti.
-  // contains pairs (sub-matrix index, row index)- or (-1,-1) meaning don't
-  // do anything for this row.
+  // Used in commands kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti and
+  // kCopyToRowsMulti.  Contains pairs (sub-matrix index, row index)- or the
+  // special pair (-1,-1) meaning "don't do anything for this row" for add
+  // commands, or "use zero" for copy commands.
   std::vector<std::vector<std::pair<int32,int32> > > indexes_multi;
 
 
diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc
index c53fba815fb..756ea45e894 100644
--- a/src/nnet3/nnet-optimize-utils.cc
+++ b/src/nnet3/nnet-optimize-utils.cc
@@ -2576,6 +2576,328 @@ bool SnipRowOps(NnetComputation *computation) {
 
 
 
+// This class implements the internals of the function SplitRowOps() which is
+// declared in nnet-optimize-utils.h.
+class RowOpsSplitter {
+ public:
+  RowOpsSplitter(NnetComputation *computation): computation_(computation) { }
+
+  // Attempts to perform the optimization; the commands are only examined if
+  // SplitIndexes() succeeded.  Returns true if it changed the computation.
+  bool Split() {
+    return SplitIndexes() && SplitCommands();
+  }
+
+ private:
+
+  // This function sets up split_info_, which describes how we can split up
+  // the vectors that are elements of computation_->indexes_multi.
+  // It will return true if it successfully split at least one of those
+  // vectors, and false otherwise.
+  bool SplitIndexes();
+
+  // This function modifies the commands in the computation.  It returns
+  // true if it made any change.
+  bool SplitCommands();
+
+
+  // This function attempts to optimize the command in
+  // computation_->commands[command_index].  It returns true if it made any
+  // change.  If we are going to have to insert an extra command into the
+  // computation, this function will append an element to new_commands_.
+  bool SplitCommand(int32 command_index);
+
+  // Below, define a multi-index as an element of NnetComputation::indexes_multi,
+  // for example,
+  // const std::vector<std::pair<int32,int32> > &multi_index = computation_->indexes_multi[1];
+  // It is a list of pairs.
+
+  // This struct appears as an element of the list inside MultiIndexSplitInfo.
+  // It helps us describe how we can split up a multi-index (a list of pairs)
+  // into a sequence of ranges where the .first value is constant across the
+  // range.
+  struct SingleSplitInfo {
+    // 'offset' is the index into the vector of pairs that forms the
+    // start of this range.  In the example where we are splitting up
+    // ((10,2), (10,3), (10,4), (15,3), (15,5), (15,7))
+    // there would be two instances of struct SingleSplitInfo, with
+    // offset = 0 and offset = 3.
+    int32 offset;
+    // 'size' is the number of pairs in this range; in the example
+    // above, both 'size' elements would be 3.
+    int32 size;
+    // first_value is the value of the .first index throughout this range; in
+    // the example above, it would be 10 and 15 respectively.  It represents a
+    // submatrix index.
+    int32 first_value;
+
+    // min_second_value is the minimum value of .second for any element in
+    // this range: it would be 2 and 3 respectively in the example above.
+    int32 min_second_value;
+
+    // second_value_range is the highest value of .second for any element in
+    // this range, plus one, minus min_second_value.  (It's the number of rows
+    // in the other submatrix of the operation).
+    int32 second_value_range;
+
+    // If the .second values in the range are consecutive then
+    // 'second_value_offsets' will be empty.  Otherwise it will
+    // be a vector of size 'size', containing numbers in the
+    // range 0 ... second_value_range - 1, such that
+    // min_second_value + second_value_offsets[i] gives
+    // the .second value at the corresponding position in the range.
+    // In the second range of the example above, the range
+    // consisting of ((15,3), (15,5), (15,7)), 'second_value_offsets'
+    // would be the vector (0, 2, 4).
+    std::vector<int32> second_value_offsets;
+  };
+
+  // An instance of the struct MultiIndexSplitInfo will be created for each
+  // multi-index, i.e. for each element of NnetComputation::indexes_multi.
+  struct MultiIndexSplitInfo {
+    // If we can split this multi-index into at most two ranges, this vector
+    // will contain one element per range; otherwise it will be empty.
+    std::vector<SingleSplitInfo> splits;
+  };
+
+  // GetSplitInfo() attempts to take a range of a
+  // std::vector<std::pair<int32, int32> >, as represented by begin and end
+  // iterators, and to extract its information into an object of type
+  // SingleSplitInfo (all except for the .offset member, which will have
+  // been set by calling code).  It returns true if successful, and false
+  // otherwise; it returns false if the range contains negative values
+  // (e.g. -1's), does not contain all-identical .first members, or spans
+  // too many rows relative to its size (see max_size_ratio in the code).
+  bool GetSplitInfo(std::vector<std::pair<int32, int32> >::const_iterator begin,
+                    std::vector<std::pair<int32, int32> >::const_iterator end,
+                    SingleSplitInfo *info);
+
+  // computation_ is the computation that we are modifying.
+  NnetComputation *computation_;
+  // split_info_ will contain information about how we can split up the members
+  // of computation_->indexes_multi into ranges.
+  std::vector<MultiIndexSplitInfo> split_info_;
+  // The following is a list of additional commands that we are going to insert
+  // into computation_, of the form (command-index, command) where command-index
+  // is a command index just before which we will insert the new command.
+  // (this is the format accepted by the function InsertCommands()).
+  std::vector<std::pair<int32, NnetComputation::Command> > new_commands_;
+
+};
+
+
+bool RowOpsSplitter::GetSplitInfo(
+    std::vector<std::pair<int32, int32> >::const_iterator begin,
+    std::vector<std::pair<int32, int32> >::const_iterator end,
+    SingleSplitInfo *info) {
+  // Upper bound on (rows spanned) / (number of pairs).  It must exceed 1 and
+  // could in principle be a float; it prevents rewrites that would launch
+  // kernels operating mostly on rows that have nothing to do.
+  const int32 kMaxSizeRatio = 2;
+
+  const int32 num_pairs = end - begin;
+  KALDI_ASSERT(num_pairs != 0);
+  const int32 submatrix_index = begin->first;
+  if (submatrix_index < 0)
+    return false;
+  info->size = num_pairs;
+  info->first_value = submatrix_index;
+  const int32 start_row = begin->second;
+  int32 lowest_row = start_row, highest_row = start_row;
+  std::vector<int32> &offsets = info->second_value_offsets;
+  offsets.resize(num_pairs);
+  bool rows_are_consecutive = true;
+  for (int32 pos = 0; pos < num_pairs; pos++) {
+    const std::pair<int32, int32> &p = begin[pos];
+    if (p.first != submatrix_index || p.second < 0) return false;
+    offsets[pos] = p.second;  // absolute for now; made relative below.
+    if (p.second != start_row + pos) rows_are_consecutive = false;
+    if (p.second < lowest_row) lowest_row = p.second;
+    if (p.second > highest_row) highest_row = p.second;
+  }
+  info->min_second_value = lowest_row;
+  info->second_value_range = highest_row + 1 - lowest_row;
+  if (info->second_value_range > num_pairs * kMaxSizeRatio)
+    return false;
+  if (rows_are_consecutive) {
+    offsets.clear();
+    return true;
+  }
+  // Convert the stored row indexes into offsets from the lowest row.
+  for (size_t k = 0; k < offsets.size(); k++)
+    offsets[k] -= lowest_row;
+  return true;
+}
+
+
+bool RowOpsSplitter::SplitIndexes() {
+  // For every multi-index, work out whether it consists of one or two ranges
+  // that each refer to a single submatrix, and record the outcome in
+  // split_info_ (an empty 'splits' vector means "could not be split").
+  typedef std::vector<std::pair<int32,int32> > PairVector;
+  const int32 num_multi_indexes = computation_->indexes_multi.size();
+  split_info_.resize(num_multi_indexes);
+
+  bool any_split = false;
+  for (int32 m = 0; m < num_multi_indexes; m++) {
+    const PairVector &pairs = computation_->indexes_multi[m];
+    std::vector<SingleSplitInfo> &splits = split_info_[m].splits;
+
+    const int32 num_pairs = pairs.size();
+    KALDI_ASSERT(num_pairs > 0);
+    // 'boundary' is the first position whose .first differs from that of
+    // position zero; it ends up equal to num_pairs if there is none.
+    int32 boundary = 1;
+    while (boundary < num_pairs && pairs[boundary].first == pairs[0].first)
+      boundary++;
+
+    bool ok;
+    if (boundary == num_pairs) {
+      // A single range covering the whole multi-index.
+      splits.resize(1);
+      splits[0].offset = 0;
+      ok = GetSplitInfo(pairs.begin(), pairs.end(),
+                        &(splits[0]));
+    } else {
+      // Two ranges, [0, boundary) and [boundary, num_pairs); GetSplitInfo()
+      // rejects the second one if its .first values are not all the same.
+      splits.resize(2);
+      splits[0].offset = 0;
+      splits[1].offset = boundary;
+      PairVector::const_iterator middle = pairs.begin() + boundary;
+      ok = GetSplitInfo(pairs.begin(), middle, &(splits[0])) &&
+          GetSplitInfo(middle, pairs.end(), &(splits[1]));
+    }
+
+    // On failure, drop the partially-filled entries so that this
+    // multi-index is marked as un-splittable.
+    if (ok) {
+      any_split = true;
+    } else {
+      splits.clear();
+    }
+  }
+  return any_split;
+}
+
+bool RowOpsSplitter::SplitCommand(int32 c) {
+  NnetComputation::Command &command = computation_->commands[c];
+  CommandType command_type = command.command_type;
+  // Only the four *RowsMulti command types are candidates for splitting; any
+  // other command is left untouched and we return false.
+  switch (command_type) {
+    case kAddRowsMulti: case kCopyRowsMulti:
+    case kAddToRowsMulti: case kCopyToRowsMulti: break;
+    default: return false;
+  }
+  int32 indexes_multi_index = command.arg2;
+  KALDI_ASSERT(indexes_multi_index <
+               static_cast<int32>(split_info_.size()));
+  const MultiIndexSplitInfo &split_info = split_info_[indexes_multi_index];
+  if (split_info.splits.empty())
+    return false;  // these indexes couldn't be split: e.g. they contained more
+                   // than two distinct .first elements, or there were other
+                   // reasons.
+
+  // we'll be splitting the command into either one or two pieces.
+  std::vector<NnetComputation::Command> split_commands(
+      split_info.splits.size());
+  for (size_t i = 0; i < split_info.splits.size(); i++) {
+    const SingleSplitInfo &split = split_info.splits[i];
+    NnetComputation::Command &command_out = split_commands[i];
+    command_out.alpha = command.alpha;
+    command_out.arg1 = computation_->NewSubMatrix(
+        command.arg1, split.offset, split.size, 0, -1);
+    command_out.arg2 = computation_->NewSubMatrix(
+        split.first_value, split.min_second_value,
+        split.second_value_range, 0, -1);
+
+    if (split.second_value_offsets.empty()) {
+      // The .second elements are consecutive.
+      switch (command_type) {
+        case kAddRowsMulti:
+          command_out.command_type = kMatrixAdd;
+          break;
+        case kCopyRowsMulti:
+          command_out.command_type = kMatrixCopy;
+          break;
+        case kAddToRowsMulti:
+          command_out.command_type = kMatrixAdd;
+          std::swap(command_out.arg1, command_out.arg2);
+          break;
+        case kCopyToRowsMulti:
+          command_out.command_type = kMatrixCopy;
+          std::swap(command_out.arg1, command_out.arg2);
+          break;
+        default:  // will never be reached.
+          break;
+      }
+    } else {
+      // Indexes are not consecutive: it needs to be a kAddRows or kCopyRows
+      // command.
+      command_out.arg3 = computation_->indexes.size();
+      switch (command_type) {
+        case kAddRowsMulti: case kCopyRowsMulti: {
+          command_out.command_type = (command_type == kAddRowsMulti ?
+                                      kAddRows : kCopyRows);
+          computation_->indexes.push_back(split.second_value_offsets);
+          break;
+        }
+        case kCopyToRowsMulti:  {
+          // We can't operate on this command because of what would happen
+          // with values of 'indexes' (see the variable in the block for
+          // kAddToRowsMulti) which were -1.  Rows of the output would be
+          // set to zero, which is not the behavior we want here; we'd want
+          // them to be unaffected.
+          return false;
+        }
+        case kAddToRowsMulti: {
+          command_out.command_type = kAddRows;
+          std::swap(command_out.arg1, command_out.arg2);
+          // invert the indexes.
+          std::vector<int32> indexes(split.second_value_range, -1);
+          for (int32 i = 0; i < split.size; i++) {
+            // 'indexes' starts out as all -1, so each destination must still
+            // be unset (< 0) here: AddToRowsMulti and CopyToRowsMulti should
+            // never have duplicate destinations in their indexes.
+            KALDI_ASSERT(indexes[split.second_value_offsets[i]] < 0);
+            indexes[split.second_value_offsets[i]] = i;
+          }
+          computation_->indexes.push_back(indexes);
+          break;
+        }
+        default:
+          KALDI_ERR << "Code error: un-handled case.";
+      }
+    }
+  }
+  command = split_commands[0];
+  // note: for now, split_commands.size() will be 1 or 2.
+  for (size_t i = 1; i < split_commands.size(); i++) {
+    new_commands_.resize(new_commands_.size() + 1);
+    // we'll want to insert this command right after command c,
+    // which is the same as just before command c + 1.
+    new_commands_.back().first = c + 1;
+    new_commands_.back().second = split_commands[i];
+  }
+  return true;  // We made a change.
+}
+
+bool RowOpsSplitter::SplitCommands() {
+  bool changed = false;
+  const int32 command_count = computation_->commands.size();
+  for (int32 index = 0; index < command_count; index++)
+    changed = SplitCommand(index) || changed;
+  // Splice in any extra commands produced by the splits above.
+  if (!new_commands_.empty())
+    InsertCommands(&new_commands_, computation_);
+  return changed;
+}
+
+bool SplitRowOps(NnetComputation *computation) {
+  // All of the work is delegated to the helper class RowOpsSplitter.
+  return RowOpsSplitter(computation).Split();
+}
 
 
 /*
diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h
index 703f43af095..32adf9e3e19 100644
--- a/src/nnet3/nnet-optimize-utils.h
+++ b/src/nnet3/nnet-optimize-utils.h
@@ -455,6 +455,23 @@ bool ReplaceRowWithMatrixOps(NnetComputation *computation);
 /// computation->indexes.
 bool SnipRowOps(NnetComputation *computation);
 
+
+/// This function detects cases where commands of type kAddRowsMulti,
+/// kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti use indexes that
+/// correspond to at most two submatrices, in two distinct ranges without gaps
+/// filled by -1's, and could be converted to at most two commands of type
+/// kMatrixAdd, kMatrixCopy, kAddRows or kCopyRows.  (Note: it's important that
+/// this optimization takes place after SnipRowOps, because it doesn't remove
+/// the -1's from the edges of the indexes, it relies on that operation doing
+/// so).  The "without-gaps" stipulation is just for convenience of
+/// implementation, to have fewer cases to worry about.
+///
+/// This function returns true if it made any changes to the computation; if it
+/// returns true, then after calling this you should at some point do
+/// RenumberComputation(), which will remove any now-unused members of
+/// computation->indexes.
+bool SplitRowOps(NnetComputation *computation);
+
 /// This function detects submatrices and matrices that are never used (e.g. due
 /// to changes made in other optimization code), and members of indexes,
 /// indexes_multi and indexes_ranges that are unused or are duplicates, and memo
@@ -535,18 +552,18 @@ void IdentifyIndexesRangesArgs(std::vector<NnetComputation::Command> *commands,
                                std::vector<int32*> *indexes_ranges_args);
 
 /// Inserts commands into the computation at the requested places.  'commands'
-///  is a list of pairs (command-index, command) that is expected to be sorted
-///  on command-index.  For each entry (c, command) in 'commands', 'command' is
-///  inserted into 'computation' just *before* the command that (at entry) is in
-///  computation->commands[c].  If there are multiple pairs with the same index
-///  c, they will remain in the same order in which they were present in
-///  'commands'; however, 'commands' does not have to be sorted on 'c'.
-///  As a special case, if c == computation->commands.size(), the
-///  corresponding commands are inserted at the beginning of the computation.
-///  This function will appropriately renumber the argument of the kGotoLabel
-///  command of any 'looped' computation.  Command indexes c in commands[*].first
-///  must be in the range [0, computation->commands.size()].
-///  This function may modify 'commands' by sorting it.
+/// is a list of pairs (command-index, command) which does not need to be
+/// sorted on command-index.  For each entry (c, command) in 'commands',
+/// 'command' is inserted into 'computation' just *before* the command that (at
+/// entry) is in computation->commands[c].  If there are multiple pairs with the
+/// same index c, they will remain in the same order in which they were present
+/// in 'commands'.  As a special case, if c == computation->commands.size(), the
+/// corresponding commands are inserted at the end of the computation (i.e.
+/// after all the existing commands).  This function will appropriately
+/// renumber the argument of the kGotoLabel command of any 'looped'
+/// computation.  Command indexes c in commands[*].first must be in the range
+/// [0, computation->commands.size()].  This function may modify 'commands' by
+/// sorting it.
 void InsertCommands(
     std::vector<std::pair<int32, NnetComputation::Command> > *commands,
     NnetComputation *computation);
diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc
index d614afce7d0..ecce196801b 100644
--- a/src/nnet3/nnet-optimize.cc
+++ b/src/nnet3/nnet-optimize.cc
@@ -41,6 +41,14 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) {
   if (tok == "<OptimizeRowOps>") {
     ReadBasicType(is, binary, &optimize_row_ops);
     ReadToken(is, binary, &tok);
+  } else {
+    optimize_row_ops = true;
+  }
+  if (tok == "<SplitRowOps>") {
+    ReadBasicType(is, binary, &split_row_ops);
+    ReadToken(is, binary, &tok);
+  } else {
+    split_row_ops = true;
   }
   KALDI_ASSERT(tok == "<ConvertAddition>");
   ReadBasicType(is, binary, &convert_addition);
@@ -516,12 +524,16 @@ void Optimize(const NnetOptimizeOptions &config,
   }
 
 
-  if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) {
+  if (config.optimize &&  (config.snip_row_ops || config.optimize_row_ops ||
+                           config.split_row_ops)) {
     bool must_renumber = false;
     if (config.snip_row_ops && SnipRowOps(computation))
       must_renumber = true;
+    if (config.split_row_ops && SplitRowOps(computation))
+      must_renumber = true;
     if (config.optimize_row_ops && ReplaceRowWithMatrixOps(computation))
       must_renumber = true;
+
     if (must_renumber) {
       RenumberComputation(computation);
       if (GetVerboseLevel() >= 3)
diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h
index 31872e46b72..a07c5490c5c 100644
--- a/src/nnet3/nnet-optimize.h
+++ b/src/nnet3/nnet-optimize.h
@@ -39,6 +39,7 @@ struct NnetOptimizeOptions {
   bool propagate_in_place;
   bool backprop_in_place;
   bool optimize_row_ops;
+  bool split_row_ops;
   bool extend_matrices;
   bool convert_addition;
   bool remove_assignments;
@@ -63,6 +64,7 @@ struct NnetOptimizeOptions {
       propagate_in_place(true),
       backprop_in_place(true),
       optimize_row_ops(true),
+      split_row_ops(true),
       extend_matrices(true),
       convert_addition(true),
       remove_assignments(true),
@@ -95,6 +97,10 @@ struct NnetOptimizeOptions {
     opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to "
                    "disable certain optimizations that act on operations of "
                    "type *Row*.");
+    opts->Register("split-row-ops", &split_row_ops, "Set to false to disable "
+                   "an optimization that may replace some operations of type "
+                   "kCopyRowsMulti or kAddRowsMulti with up to two simpler "
+                   "operations.");
     opts->Register("convert-addition", &convert_addition, "Set to false to "
                    "disable the optimization that converts Add commands into "
                    "Copy commands wherever possible.");
diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc
index 59885cf70b2..b0c81aae2b8 100644
--- a/src/nnet3/nnet-utils.cc
+++ b/src/nnet3/nnet-utils.cc
@@ -1805,19 +1805,7 @@ class ModelCollapser {
 void CollapseModel(const CollapseModelConfig &config,
                    Nnet *nnet) {
   ModelCollapser c(config, nnet);
-  std::string info_before_collapse;
-  if (GetVerboseLevel() >= 4)
-    info_before_collapse = nnet->Info();
   c.Collapse();
-  if (GetVerboseLevel() >= 4) {
-    std::string info_after_collapse = nnet->Info();
-    if (info_after_collapse != info_before_collapse) {
-      KALDI_VLOG(4) << "Collapsing model: info before collapse was: "
-                    << info_before_collapse
-                    << ", info after collapse was:"
-                    << info_after_collapse;
-    }
-  }
 }
 
 bool UpdateNnetWithMaxChange(const Nnet &delta_nnet,

From 458dee84826fa1f6a7c3c352ba5a40d850466908 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 11 Feb 2018 02:15:11 -0500
Subject: [PATCH 02/10] [scripts] Add dropout-per-dim option

---
 .../steps/libs/nnet3/xconfig/basic_layers.py  | 33 ++++++++++++++++---
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index 47a5dfdf082..a3dfa89cf0e 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
@@ -686,7 +686,9 @@ def set_default_configs(self):
                        'ng-linear-options': '',    # only affects bottleneck layers.
                        'dropout-proportion': 0.5,  # dropout-proportion only
                                                    # affects layers with
-                                                   # 'dropout' in the name.
+                                                   # 'dropout' in the name
+                       'dropout-per-dim': False,  # if dropout-per-dim=true, the dropout
+                                                  # mask is shared across time.
                        'add-log-stddev': False,
                        # the following are not really inspected by this level of
                        # code, just passed through (but not if left at '').
@@ -862,10 +864,31 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                         ''.format(self.name, nonlinearity, output_dim))
 
             elif nonlinearity == 'dropout':
-                line = ('component name={0}.{1} type=DropoutComponent '
-                        'dim={2} dropout-proportion={3}'.format(
-                            self.name, nonlinearity, output_dim,
-                            self.config['dropout-proportion']))
+                if not self.config['dropout-per-dim']:
+                    line = ('component name={0}.{1} type=DropoutComponent '
+                            'dim={2} dropout-proportion={3}'.format(
+                                self.name, nonlinearity, output_dim,
+                                self.config['dropout-proportion']))
+                else:
+                    line = ('component name={0}.dropout_mask type=DropoutMaskComponent '
+                            'output-dim={1} dropout-proportion={2}'.format(
+                                self.name, output_dim, self.config['dropout-proportion']))
+                    configs.append(line)
+                    # note: the input to the dropout_mask component is never used, it's
+                    # just syntactically required.
+                    line = ('component-node name={0}.dropout_mask component={0}.dropout_mask '
+                            'input={1}'.format(self.name, cur_node))
+                    configs.append(line)
+                    line = ('component name={0}.dropout type=ElementwiseProductComponent '
+                            'input-dim={1} output-dim={2} '.format(
+                                self.name, 2 * output_dim, output_dim))
+                    configs.append(line)
+                    line = ('component-node name={0}.dropout component={0}.dropout '
+                            'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))'
+                            ''.format(self.name, cur_node))
+                    configs.append(line)
+                    cur_node = '{0}.dropout'.format(self.name)
+                    continue
 
             else:
                 raise RuntimeError("Unknown nonlinearity type: {0}"

From 2181f4fc64ce5b6a3d5f42dd0172e95050491287 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 13 Feb 2018 22:22:55 -0500
Subject: [PATCH 03/10] [src,scripts] Add 'continuous' option for dropout.

---
 .../steps/libs/nnet3/xconfig/basic_layers.py  |  9 ++-
 src/nnet3/nnet-general-component.cc           | 75 +++++++++++++------
 src/nnet3/nnet-general-component.h            |  9 ++-
 src/nnet3/nnet-simple-component.h             |  2 +-
 4 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index a3dfa89cf0e..132fbdb6d82 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
@@ -689,6 +689,7 @@ def set_default_configs(self):
                                                    # 'dropout' in the name
                        'dropout-per-dim': False,  # if dropout-per-dim=true, the dropout
                                                   # mask is shared across time.
+                       'dropout-per-dim-continuous':  False,
                        'add-log-stddev': False,
                        # the following are not really inspected by this level of
                        # code, just passed through (but not if left at '').
@@ -870,9 +871,11 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                                 self.name, nonlinearity, output_dim,
                                 self.config['dropout-proportion']))
                 else:
+                    continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else ''
                     line = ('component name={0}.dropout_mask type=DropoutMaskComponent '
-                            'output-dim={1} dropout-proportion={2}'.format(
-                                self.name, output_dim, self.config['dropout-proportion']))
+                            'output-dim={1} dropout-proportion={2} {3}'.format(
+                                self.name, output_dim, self.config['dropout-proportion'],
+                                continuous_opt))
                     configs.append(line)
                     # note: the input to the dropout_mask component is never used, it's
                     # just syntactically required.
@@ -886,6 +889,8 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                     line = ('component-node name={0}.dropout component={0}.dropout '
                             'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))'
                             ''.format(self.name, cur_node))
+
+
                     configs.append(line)
                     cur_node = '{0}.dropout'.format(self.name)
                     continue
diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index dd6e950a7d1..1ccb3d254fc 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -1392,16 +1392,19 @@ std::string DropoutMaskComponent::Info() const {
   stream << Type()
          << ", output-dim=" << output_dim_
          << ", dropout-proportion=" << dropout_proportion_;
+  if (continuous_)
+    stream << ", continuous=true";
   return stream.str();
 }
 
 DropoutMaskComponent::DropoutMaskComponent():
-    output_dim_(-1), dropout_proportion_(0.5) { }
+    output_dim_(-1), dropout_proportion_(0.5), continuous_(false) { }
 
 DropoutMaskComponent::DropoutMaskComponent(
     const DropoutMaskComponent &other):
     output_dim_(other.output_dim_),
-    dropout_proportion_(other.dropout_proportion_) { }
+    dropout_proportion_(other.dropout_proportion_),
+    continuous_(other.continuous_) { }
 
 void* DropoutMaskComponent::Propagate(
     const ComponentPrecomputedIndexes *indexes,
@@ -1415,29 +1418,47 @@ void* DropoutMaskComponent::Propagate(
     out->Set(1.0);
     return NULL;
   }
+
+  if (continuous_) {
+    if (test_mode_) {
+      out->Set(1.0);
+    } else {
+      const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+      out->Scale(dropout_proportion * 4.0);
+      // make the expected value 1.0.
+      out->Add(1.0 - (2.0 * dropout_proportion));
+    }
+    return NULL;
+  }
+
   if (test_mode_) {
     out->Set(1.0 - dropout_proportion);
     return NULL;
   }
+
   const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
   out->Add(-dropout_proportion);
   out->ApplyHeaviside();
-  // To generate data where it's never the case that both of the dimensions
-  // for a row are zero, we generate uniformly distributed data (call this u_i),
-  // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1)
-  //                and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1)
-  int32 num_rows = out->NumRows();
-  // later we may make this a bit more efficient.
-  CuVector<BaseFloat> temp(num_rows, kUndefined);
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&temp);
-  temp.Add(-dropout_proportion);
-  out->CopyColFromVec(temp, 0);
-  temp.Add(-1.0 + (2.0 * dropout_proportion));
-  // Now, 'temp' contains the original uniformly-distributed data plus
-  // -(1 - dropout_proportion).
-  temp.Scale(-1.0);
-  out->CopyColFromVec(temp, 1);
-  out->ApplyHeaviside();
+
+  if (out->NumCols() == 2 || out->NumCols() == 3) {
+    // This is a kind of special case relevant to LSTMs.
+    // To generate data where it's never the case that both of the dimensions
+    // for a row are zero, we generate uniformly distributed data (call this u_i),
+    // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1)
+    //                and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1)
+    int32 num_rows = out->NumRows();
+    // later we may make this a bit more efficient.
+    CuVector<BaseFloat> temp(num_rows, kUndefined);
+    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&temp);
+    temp.Add(-dropout_proportion);
+    out->CopyColFromVec(temp, 0);
+    temp.Add(-1.0 + (2.0 * dropout_proportion));
+    // Now, 'temp' contains the original uniformly-distributed data plus
+    // -(1 - dropout_proportion).
+    temp.Scale(-1.0);
+    out->CopyColFromVec(temp, 1);
+    out->ApplyHeaviside();
+  }
   return NULL;
 }
 
@@ -1447,15 +1468,19 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &output_dim_);
   ExpectToken(is, binary, "<DropoutProportion>");
   ReadBasicType(is, binary, &dropout_proportion_);
-  std::string token;
-  ReadToken(is, binary, &token);
-  if (token == "<TestMode>") {
+  if (PeekToken(is, binary) == 'T') {
+    ExpectToken(is, binary, "<TestMode>");
     ReadBasicType(is, binary, &test_mode_);  // read test mode
-    ExpectToken(is, binary, "</DropoutMaskComponent>");
   } else {
     test_mode_ = false;
-    KALDI_ASSERT(token == "</DropoutMaskComponent>");
   }
+  if (PeekToken(is, binary) == 'C') {
+    ExpectToken(is, binary, "<Continuous>");
+    continuous_ = true;
+  } else {
+    continuous_ = false;
+  }
+  ExpectToken(is, binary, "</DropoutMaskComponent>");
 }
 
 
@@ -1467,6 +1492,8 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, dropout_proportion_);
   WriteToken(os, binary, "<TestMode>");
   WriteBasicType(os, binary, test_mode_);
+  if (continuous_)
+    WriteToken(os, binary, "<Continuous>");
   WriteToken(os, binary, "</DropoutMaskComponent>");
 }
 
@@ -1480,6 +1507,8 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) {
   KALDI_ASSERT(ok && output_dim_ > 0);
   dropout_proportion_ = 0.5;
   cfl->GetValue("dropout-proportion", &dropout_proportion_);
+  continuous_ = false;
+  cfl->GetValue("continuous", &continuous_);
   test_mode_ = false;
   cfl->GetValue("test-mode", &test_mode_);
 }
diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h
index 36829329d66..86614a1847f 100644
--- a/src/nnet3/nnet-general-component.h
+++ b/src/nnet3/nnet-general-component.h
@@ -715,7 +715,12 @@ class DropoutMaskComponent: public RandomComponent {
   virtual std::string Info() const;
 
   // possible parameter values with their defaults:
-  // dropout-proportion=0.5 output-dim=-1
+  // dropout-proportion=0.5 output-dim=-1 continuous=false
+  // With the 'continuous=false' option (the default), it generates
+  // 0 with probability 'dropout-proportion' and 1 otherwise.
+  // With 'continuous=true' it outputs 1 plus dropout-proportion times
+  //  a value uniformly distributed on [-2, 2].  (e.g. if dropout-proportion is
+  // 0.5, this would amount to a value uniformly distributed on [0,2].)
   virtual void InitFromConfig(ConfigLine *cfl);
 
   DropoutMaskComponent();
@@ -771,6 +776,8 @@ class DropoutMaskComponent: public RandomComponent {
 
   BaseFloat dropout_proportion_;
 
+  bool continuous_;
+
   const DropoutMaskComponent &operator
   = (const DropoutMaskComponent &other); // Disallow.
 };
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index b1eb30a55bf..2d776180533 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -2176,7 +2176,7 @@ class LstmNonlinearityComponent: public UpdatableComponent {
   // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
   CuMatrix<BaseFloat> params_;
 
-  // If true, we expect an extra 2 dimensions on the input, for dropout masks
+  // If true, we expect an extra 3 dimensions on the input, for dropout masks
   // for i_t and f_t.
   bool use_dropout_;
 

From 99fcff919f625c79e346317c8cd0d70f3a60e25f Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 14 Feb 2018 23:35:59 -0500
Subject: [PATCH 04/10] [src,scripts] adding script-level and code support for
 continuous dropout

---
 .../steps/libs/nnet3/xconfig/basic_layers.py  |  9 ++-
 src/nnet3/nnet-general-component.cc           | 75 +++++++++++++------
 src/nnet3/nnet-general-component.h            |  9 ++-
 src/nnet3/nnet-simple-component.h             |  2 +-
 4 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index a3dfa89cf0e..132fbdb6d82 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
@@ -689,6 +689,7 @@ def set_default_configs(self):
                                                    # 'dropout' in the name
                        'dropout-per-dim': False,  # if dropout-per-dim=true, the dropout
                                                   # mask is shared across time.
+                       'dropout-per-dim-continuous':  False,
                        'add-log-stddev': False,
                        # the following are not really inspected by this level of
                        # code, just passed through (but not if left at '').
@@ -870,9 +871,11 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                                 self.name, nonlinearity, output_dim,
                                 self.config['dropout-proportion']))
                 else:
+                    continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else ''
                     line = ('component name={0}.dropout_mask type=DropoutMaskComponent '
-                            'output-dim={1} dropout-proportion={2}'.format(
-                                self.name, output_dim, self.config['dropout-proportion']))
+                            'output-dim={1} dropout-proportion={2} {3}'.format(
+                                self.name, output_dim, self.config['dropout-proportion'],
+                                continuous_opt))
                     configs.append(line)
                     # note: the input to the dropout_mask component is never used, it's
                     # just syntactically required.
@@ -886,6 +889,8 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                     line = ('component-node name={0}.dropout component={0}.dropout '
                             'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))'
                             ''.format(self.name, cur_node))
+
+
                     configs.append(line)
                     cur_node = '{0}.dropout'.format(self.name)
                     continue
diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index dd6e950a7d1..1ccb3d254fc 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -1392,16 +1392,19 @@ std::string DropoutMaskComponent::Info() const {
   stream << Type()
          << ", output-dim=" << output_dim_
          << ", dropout-proportion=" << dropout_proportion_;
+  if (continuous_)
+    stream << ", continuous=true";
   return stream.str();
 }
 
 DropoutMaskComponent::DropoutMaskComponent():
-    output_dim_(-1), dropout_proportion_(0.5) { }
+    output_dim_(-1), dropout_proportion_(0.5), continuous_(false) { }
 
 DropoutMaskComponent::DropoutMaskComponent(
     const DropoutMaskComponent &other):
     output_dim_(other.output_dim_),
-    dropout_proportion_(other.dropout_proportion_) { }
+    dropout_proportion_(other.dropout_proportion_),
+    continuous_(other.continuous_) { }
 
 void* DropoutMaskComponent::Propagate(
     const ComponentPrecomputedIndexes *indexes,
@@ -1415,29 +1418,47 @@ void* DropoutMaskComponent::Propagate(
     out->Set(1.0);
     return NULL;
   }
+
+  if (continuous_) {
+    if (test_mode_) {
+      out->Set(1.0);
+    } else {
+      const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+      out->Scale(dropout_proportion * 4.0);
+      // make the expected value 1.0.
+      out->Add(1.0 - (2.0 * dropout_proportion));
+    }
+    return NULL;
+  }
+
   if (test_mode_) {
     out->Set(1.0 - dropout_proportion);
     return NULL;
   }
+
   const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
   out->Add(-dropout_proportion);
   out->ApplyHeaviside();
-  // To generate data where it's never the case that both of the dimensions
-  // for a row are zero, we generate uniformly distributed data (call this u_i),
-  // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1)
-  //                and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1)
-  int32 num_rows = out->NumRows();
-  // later we may make this a bit more efficient.
-  CuVector<BaseFloat> temp(num_rows, kUndefined);
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&temp);
-  temp.Add(-dropout_proportion);
-  out->CopyColFromVec(temp, 0);
-  temp.Add(-1.0 + (2.0 * dropout_proportion));
-  // Now, 'temp' contains the original uniformly-distributed data plus
-  // -(1 - dropout_proportion).
-  temp.Scale(-1.0);
-  out->CopyColFromVec(temp, 1);
-  out->ApplyHeaviside();
+
+  if (out->NumCols() == 2 || out->NumCols() == 3) {
+    // This is a kind of special case relevant to LSTMs.
+    // To generate data where it's never the case that both of the dimensions
+    // for a row are zero, we generate uniformly distributed data (call this u_i),
+    // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1)
+    //                and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1)
+    int32 num_rows = out->NumRows();
+    // later we may make this a bit more efficient.
+    CuVector<BaseFloat> temp(num_rows, kUndefined);
+    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&temp);
+    temp.Add(-dropout_proportion);
+    out->CopyColFromVec(temp, 0);
+    temp.Add(-1.0 + (2.0 * dropout_proportion));
+    // Now, 'temp' contains the original uniformly-distributed data plus
+    // -(1 - dropout_proportion).
+    temp.Scale(-1.0);
+    out->CopyColFromVec(temp, 1);
+    out->ApplyHeaviside();
+  }
   return NULL;
 }
 
@@ -1447,15 +1468,19 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &output_dim_);
   ExpectToken(is, binary, "<DropoutProportion>");
   ReadBasicType(is, binary, &dropout_proportion_);
-  std::string token;
-  ReadToken(is, binary, &token);
-  if (token == "<TestMode>") {
+  if (PeekToken(is, binary) == 'T') {
+    ExpectToken(is, binary, "<TestMode>");
     ReadBasicType(is, binary, &test_mode_);  // read test mode
-    ExpectToken(is, binary, "</DropoutMaskComponent>");
   } else {
     test_mode_ = false;
-    KALDI_ASSERT(token == "</DropoutMaskComponent>");
   }
+  if (PeekToken(is, binary) == 'C') {
+    ExpectToken(is, binary, "<Continuous>");
+    continuous_ = true;
+  } else {
+    continuous_ = false;
+  }
+  ExpectToken(is, binary, "</DropoutMaskComponent>");
 }
 
 
@@ -1467,6 +1492,8 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, dropout_proportion_);
   WriteToken(os, binary, "<TestMode>");
   WriteBasicType(os, binary, test_mode_);
+  if (continuous_)
+    WriteToken(os, binary, "<Continuous>");
   WriteToken(os, binary, "</DropoutMaskComponent>");
 }
 
@@ -1480,6 +1507,8 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) {
   KALDI_ASSERT(ok && output_dim_ > 0);
   dropout_proportion_ = 0.5;
   cfl->GetValue("dropout-proportion", &dropout_proportion_);
+  continuous_ = false;
+  cfl->GetValue("continuous", &continuous_);
   test_mode_ = false;
   cfl->GetValue("test-mode", &test_mode_);
 }
diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h
index 36829329d66..86614a1847f 100644
--- a/src/nnet3/nnet-general-component.h
+++ b/src/nnet3/nnet-general-component.h
@@ -715,7 +715,12 @@ class DropoutMaskComponent: public RandomComponent {
   virtual std::string Info() const;
 
   // possible parameter values with their defaults:
-  // dropout-proportion=0.5 output-dim=-1
+  // dropout-proportion=0.5 output-dim=-1 continuous=false
+  // With the 'continuous=false' option (the default), it generates
+  // 0 with probability 'dropout-proportion' and 1 otherwise.
+  // With 'continuous=true' it outputs 1 plus dropout-proportion times
+  //  a value uniformly distributed on [-2, 2].  (e.g. if dropout-proportion is
+  // 0.5, this would amount to a value uniformly distributed on [0,2].)
   virtual void InitFromConfig(ConfigLine *cfl);
 
   DropoutMaskComponent();
@@ -771,6 +776,8 @@ class DropoutMaskComponent: public RandomComponent {
 
   BaseFloat dropout_proportion_;
 
+  bool continuous_;
+
   const DropoutMaskComponent &operator
   = (const DropoutMaskComponent &other); // Disallow.
 };
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index b1eb30a55bf..2d776180533 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -2176,7 +2176,7 @@ class LstmNonlinearityComponent: public UpdatableComponent {
   // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
   CuMatrix<BaseFloat> params_;
 
-  // If true, we expect an extra 2 dimensions on the input, for dropout masks
+  // If true, we expect an extra 3 dimensions on the input, for dropout masks
   // for i_t and f_t.
   bool use_dropout_;
 

From 9bddde88964e8ab5830995b4c346476cd90f4c15 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Fri, 16 Feb 2018 22:09:45 -0500
Subject: [PATCH 05/10] [src] Small fix to cu-kernels.cu RE compression kernel

---
 src/cudamatrix/cu-kernels.cu | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index b0468b7fa7c..ae7e25b716d 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -3601,7 +3601,7 @@ static void _cuda_compress_bounds_check(const BaseFloat *src, MatrixDim dim,
       src_index = i + j * dim.stride;
   const int min_value = minimum_integer_value<I>(),
       max_value = maximum_integer_value<I>();
-  int16_t compressed_value;
+  int compressed_value;
   int ok = (i < dim.cols && j < dim.rows);
   if  (ok) {
     float f = src[src_index];
@@ -3611,7 +3611,6 @@ static void _cuda_compress_bounds_check(const BaseFloat *src, MatrixDim dim,
     // range of 'int', and if it fails, we've probably already catastrophically
     // diverged.
     int i = __float2int_rn(f * inv_scale);
-    // note: SignedInt will be int8 or (more likely) int16.
     if (i < min_value) compressed_value = min_value;
     else if (i > max_value) compressed_value = max_value;
     else compressed_value = i;

From 82d2e7505ed5dc163f8205830fb4a9db88f13456 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Tue, 13 Feb 2018 22:22:55 -0500
Subject: [PATCH 06/10] [src,scripts] Add 'continuous' option for dropout.

---
 .../steps/libs/nnet3/xconfig/basic_layers.py  |  9 ++-
 src/nnet3/nnet-general-component.cc           | 75 +++++++++++++------
 src/nnet3/nnet-general-component.h            |  9 ++-
 src/nnet3/nnet-simple-component.h             |  2 +-
 4 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index a3dfa89cf0e..132fbdb6d82 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
@@ -689,6 +689,7 @@ def set_default_configs(self):
                                                    # 'dropout' in the name
                        'dropout-per-dim': False,  # if dropout-per-dim=true, the dropout
                                                   # mask is shared across time.
+                       'dropout-per-dim-continuous':  False,
                        'add-log-stddev': False,
                        # the following are not really inspected by this level of
                        # code, just passed through (but not if left at '').
@@ -870,9 +871,11 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                                 self.name, nonlinearity, output_dim,
                                 self.config['dropout-proportion']))
                 else:
+                    continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else ''
                     line = ('component name={0}.dropout_mask type=DropoutMaskComponent '
-                            'output-dim={1} dropout-proportion={2}'.format(
-                                self.name, output_dim, self.config['dropout-proportion']))
+                            'output-dim={1} dropout-proportion={2} {3}'.format(
+                                self.name, output_dim, self.config['dropout-proportion'],
+                                continuous_opt))
                     configs.append(line)
                     # note: the input to the dropout_mask component is never used, it's
                     # just syntactically required.
@@ -886,6 +889,8 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                     line = ('component-node name={0}.dropout component={0}.dropout '
                             'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))'
                             ''.format(self.name, cur_node))
+
+
                     configs.append(line)
                     cur_node = '{0}.dropout'.format(self.name)
                     continue
diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index dd6e950a7d1..1ccb3d254fc 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -1392,16 +1392,19 @@ std::string DropoutMaskComponent::Info() const {
   stream << Type()
          << ", output-dim=" << output_dim_
          << ", dropout-proportion=" << dropout_proportion_;
+  if (continuous_)
+    stream << ", continuous=true";
   return stream.str();
 }
 
 DropoutMaskComponent::DropoutMaskComponent():
-    output_dim_(-1), dropout_proportion_(0.5) { }
+    output_dim_(-1), dropout_proportion_(0.5), continuous_(false) { }
 
 DropoutMaskComponent::DropoutMaskComponent(
     const DropoutMaskComponent &other):
     output_dim_(other.output_dim_),
-    dropout_proportion_(other.dropout_proportion_) { }
+    dropout_proportion_(other.dropout_proportion_),
+    continuous_(other.continuous_) { }
 
 void* DropoutMaskComponent::Propagate(
     const ComponentPrecomputedIndexes *indexes,
@@ -1415,29 +1418,47 @@ void* DropoutMaskComponent::Propagate(
     out->Set(1.0);
     return NULL;
   }
+
+  if (continuous_) {
+    if (test_mode_) {
+      out->Set(1.0);
+    } else {
+      const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
+      out->Scale(dropout_proportion * 4.0);
+      // make the expected value 1.0.
+      out->Add(1.0 - (2.0 * dropout_proportion));
+    }
+    return NULL;
+  }
+
   if (test_mode_) {
     out->Set(1.0 - dropout_proportion);
     return NULL;
   }
+
   const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(out);
   out->Add(-dropout_proportion);
   out->ApplyHeaviside();
-  // To generate data where it's never the case that both of the dimensions
-  // for a row are zero, we generate uniformly distributed data (call this u_i),
-  // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1)
-  //                and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1)
-  int32 num_rows = out->NumRows();
-  // later we may make this a bit more efficient.
-  CuVector<BaseFloat> temp(num_rows, kUndefined);
-  const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&temp);
-  temp.Add(-dropout_proportion);
-  out->CopyColFromVec(temp, 0);
-  temp.Add(-1.0 + (2.0 * dropout_proportion));
-  // Now, 'temp' contains the original uniformly-distributed data plus
-  // -(1 - dropout_proportion).
-  temp.Scale(-1.0);
-  out->CopyColFromVec(temp, 1);
-  out->ApplyHeaviside();
+
+  if (out->NumCols() == 2 || out->NumCols() == 3) {
+    // This is a kind of special case relevant to LSTMs.
+    // To generate data where it's never the case that both of the dimensions
+    // for a row are zero, we generate uniformly distributed data (call this u_i),
+    // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1)
+    //                and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1)
+    int32 num_rows = out->NumRows();
+    // later we may make this a bit more efficient.
+    CuVector<BaseFloat> temp(num_rows, kUndefined);
+    const_cast<CuRand<BaseFloat>&>(random_generator_).RandUniform(&temp);
+    temp.Add(-dropout_proportion);
+    out->CopyColFromVec(temp, 0);
+    temp.Add(-1.0 + (2.0 * dropout_proportion));
+    // Now, 'temp' contains the original uniformly-distributed data plus
+    // -(1 - dropout_proportion).
+    temp.Scale(-1.0);
+    out->CopyColFromVec(temp, 1);
+    out->ApplyHeaviside();
+  }
   return NULL;
 }
 
@@ -1447,15 +1468,19 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &output_dim_);
   ExpectToken(is, binary, "<DropoutProportion>");
   ReadBasicType(is, binary, &dropout_proportion_);
-  std::string token;
-  ReadToken(is, binary, &token);
-  if (token == "<TestMode>") {
+  if (PeekToken(is, binary) == 'T') {
+    ExpectToken(is, binary, "<TestMode>");
     ReadBasicType(is, binary, &test_mode_);  // read test mode
-    ExpectToken(is, binary, "</DropoutMaskComponent>");
   } else {
     test_mode_ = false;
-    KALDI_ASSERT(token == "</DropoutMaskComponent>");
   }
+  if (PeekToken(is, binary) == 'C') {
+    ExpectToken(is, binary, "<Continuous>");
+    continuous_ = true;
+  } else {
+    continuous_ = false;
+  }
+  ExpectToken(is, binary, "</DropoutMaskComponent>");
 }
 
 
@@ -1467,6 +1492,8 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, dropout_proportion_);
   WriteToken(os, binary, "<TestMode>");
   WriteBasicType(os, binary, test_mode_);
+  if (continuous_)
+    WriteToken(os, binary, "<Continuous>");
   WriteToken(os, binary, "</DropoutMaskComponent>");
 }
 
@@ -1480,6 +1507,8 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) {
   KALDI_ASSERT(ok && output_dim_ > 0);
   dropout_proportion_ = 0.5;
   cfl->GetValue("dropout-proportion", &dropout_proportion_);
+  continuous_ = false;
+  cfl->GetValue("continuous", &continuous_);
   test_mode_ = false;
   cfl->GetValue("test-mode", &test_mode_);
 }
diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h
index 36829329d66..86614a1847f 100644
--- a/src/nnet3/nnet-general-component.h
+++ b/src/nnet3/nnet-general-component.h
@@ -715,7 +715,12 @@ class DropoutMaskComponent: public RandomComponent {
   virtual std::string Info() const;
 
   // possible parameter values with their defaults:
-  // dropout-proportion=0.5 output-dim=-1
+  // dropout-proportion=0.5 output-dim=-1 continuous=false
+  // With the 'continuous=false' option (the default), it generates
+  // 0 with probability 'dropout-proportion' and 1 otherwise.
+  // With 'continuous=true' it outputs 1 plus dropout-proportion times
+  //  a value uniformly distributed on [-2, 2].  (e.g. if dropout-proportion is
+  // 0.5, this would amount to a value uniformly distributed on [0,2].)
   virtual void InitFromConfig(ConfigLine *cfl);
 
   DropoutMaskComponent();
@@ -771,6 +776,8 @@ class DropoutMaskComponent: public RandomComponent {
 
   BaseFloat dropout_proportion_;
 
+  bool continuous_;
+
   const DropoutMaskComponent &operator
   = (const DropoutMaskComponent &other); // Disallow.
 };
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index b1eb30a55bf..2d776180533 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -2176,7 +2176,7 @@ class LstmNonlinearityComponent: public UpdatableComponent {
   // it contains the 3 diagonal parameter matrices w_i, w_f and w_o.
   CuMatrix<BaseFloat> params_;
 
-  // If true, we expect an extra 2 dimensions on the input, for dropout masks
+  // If true, we expect an extra 3 dimensions on the input, for dropout masks
   // for i_t and f_t.
   bool use_dropout_;
 

From 83b97da1d6361651c9d19dfe8070feb596766c7b Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 18 Feb 2018 17:31:08 -0500
Subject: [PATCH 07/10] [src] Prevent NaNs from appearing in chain
 forward-backward when nnet output is a bit out of range.

---
 .../s5c/local/chain/tuning/run_tdnn_7l.sh     |  2 +-
 .../steps/libs/nnet3/xconfig/basic_layers.py  | 20 ++++++++----
 egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py |  1 +
 src/chain/chain-denominator.cc                |  5 ++-
 src/cudamatrix/cu-kernels-ansi.h              |  4 +++
 src/cudamatrix/cu-kernels.cu                  | 32 +++++++++++++++++++
 src/cudamatrix/cu-kernels.h                   |  8 +++++
 src/cudamatrix/cu-matrix-test.cc              | 25 +++++++++++++++
 src/cudamatrix/cu-matrix.cc                   | 31 ++++++++++++++++++
 src/cudamatrix/cu-matrix.h                    |  7 ++++
 src/nnet3/nnet-general-component.cc           |  2 +-
 11 files changed, 128 insertions(+), 9 deletions(-)

diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
index f7681a743e1..43073895382 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
@@ -10,7 +10,7 @@
 #Final valid prob         -0.113     -0.116
 #Final train prob (xent)  -1.25      -1.38
 #Final valid prob (xent)  -1.36      -1.48
-#Time consuming one iter  53.56s     48.18s  
+#Time consuming one iter  53.56s     48.18s
 #Time reduction percent   10.1%
 set -e
 
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
index 132fbdb6d82..2cacbf43d16 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py
@@ -688,7 +688,9 @@ def set_default_configs(self):
                                                    # affects layers with
                                                    # 'dropout' in the name
                        'dropout-per-dim': False,  # if dropout-per-dim=true, the dropout
-                                                  # mask is shared across time.
+                                                  # mask is shared across time.  Alternatively, set 'dropout-period'.
+                       'dropout-period': 0,  # if set to a nonzero value (e.g. 10), we'll share the
+                                               # dropout mask across chunks of time values.
                        'dropout-per-dim-continuous':  False,
                        'add-log-stddev': False,
                        # the following are not really inspected by this level of
@@ -865,7 +867,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                         ''.format(self.name, nonlinearity, output_dim))
 
             elif nonlinearity == 'dropout':
-                if not self.config['dropout-per-dim']:
+                if not self.config['dropout-per-dim'] and self.config['dropout-period'] == 0:
                     line = ('component name={0}.{1} type=DropoutComponent '
                             'dim={2} dropout-proportion={3}'.format(
                                 self.name, nonlinearity, output_dim,
@@ -886,10 +888,16 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
                             'input-dim={1} output-dim={2} '.format(
                                 self.name, 2 * output_dim, output_dim))
                     configs.append(line)
-                    line = ('component-node name={0}.dropout component={0}.dropout '
-                            'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))'
-                            ''.format(self.name, cur_node))
-
+                    if self.config['dropout-per-dim']:
+                        line = ('component-node name={0}.dropout component={0}.dropout '
+                                'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))'
+                                ''.format(self.name, cur_node))
+                    else:
+                        dropout_period = self.config['dropout-period']
+                        assert dropout_period > 0
+                        line = ('component-node name={0}.dropout component={0}.dropout '
+                                'input=Append({1}, Round({0}.dropout_mask, {2}))'
+                                ''.format(self.name, cur_node, dropout_period))
 
                     configs.append(line)
                     cur_node = '{0}.dropout'.format(self.name)
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index 6fbde1fbbcc..fe25d95df91 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -23,6 +23,7 @@
         'relu-layer' : xlayers.XconfigBasicLayer,
         'relu-renorm-layer' : xlayers.XconfigBasicLayer,
         'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer,
+        'relu-dropout-batchnorm-layer' : xlayers.XconfigBasicLayer,
         'relu-dropout-layer': xlayers.XconfigBasicLayer,
         'relu-batchnorm-layer' : xlayers.XconfigBasicLayer,
         'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer,
diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc
index 620ea873eb7..1e2eb42a690 100644
--- a/src/chain/chain-denominator.cc
+++ b/src/chain/chain-denominator.cc
@@ -57,7 +57,10 @@ DenominatorComputation::DenominatorComputation(
                  num_sequences_).SetZero();
 
   KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0);
-  exp_nnet_output_transposed_.ApplyExp();
+  // We limit the nnet output to the range [-30,30] before doing the exp;
+  // this avoids NaNs appearing in the forward-backward computation, which
+  // is not done in log space.
+  exp_nnet_output_transposed_.ApplyExpLimited(-30.0, 30.0);
 }
 
 
diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h
index 8ab03c7e14e..ac2f15aa2e2 100644
--- a/src/cudamatrix/cu-kernels-ansi.h
+++ b/src/cudamatrix/cu-kernels-ansi.h
@@ -194,6 +194,10 @@ void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val,
                          MatrixDim d);
 void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d);
 void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d);
+void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d,
+                             double lower_limit, double upper_limit);
+void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d,
+                             float lower_limit, float upper_limit);
 void cudaD_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim,
                              const double* in, int in_stride);
 void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim,
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index ae7e25b716d..6ca8b107dad 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -400,6 +400,26 @@ static void _apply_exp(Real* mat, MatrixDim d) {
   }
 }
 
+template<typename Real>
+__global__
+static void _apply_exp_limited(Real* mat, MatrixDim d,
+                               Real lower_limit, Real upper_limit) {
+  int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x;
+  int32_cuda j = blockIdx.y * blockDim.y + threadIdx.y;
+  int32_cuda index = i + j * d.stride;
+  if (i < d.cols && j < d.rows) {
+    Real x = mat[index];
+    // I'm writing !(x >= lower_limit) instead of (x < lower_limit) so that
+    // nan's will be set to the lower-limit.
+    if (!(x >= lower_limit))
+      x = lower_limit;
+    else if (x > upper_limit)
+      x = upper_limit;
+    mat[index] = exp(x);
+  }
+}
+
+
 template<typename Real>
 __global__
 static void _scale_diag_packed(Real* mat, Real value, int dim) {
@@ -3717,6 +3737,11 @@ void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
   _apply_exp<<<Gr,Bl>>>(mat,d);
 }
 
+void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d,
+                             float lower_limit, float upper_limit) {
+  _apply_exp_limited<<<Gr,Bl>>>(mat, d, lower_limit, upper_limit);
+}
+
 void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d) {
   _apply_pow<<<Gr,Bl>>>(mat, power, d);
 }
@@ -4407,6 +4432,13 @@ void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
   _apply_exp<<<Gr,Bl>>>(mat,d);
 }
 
+void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d,
+                             double lower_limit, double upper_limit) {
+  _apply_exp_limited<<<Gr,Bl>>>(mat, d, lower_limit, upper_limit);
+}
+
+
+
 void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d) {
   _apply_pow<<<Gr,Bl>>>(mat, power, d);
 }
diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h
index 3518e0c71ed..871c959f5d4 100644
--- a/src/cudamatrix/cu-kernels.h
+++ b/src/cudamatrix/cu-kernels.h
@@ -335,6 +335,14 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) {
 inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) {
   cudaF_apply_exp(Gr, Bl, mat, d);
 }
+inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d,
+                                   double lower_limit, double upper_limit) {
+  cudaD_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit);
+}
+inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d,
+                                   float lower_limit, float upper_limit) {
+  cudaF_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit);
+}
 inline void cuda_apply_exp_special(dim3 Gr, dim3 Bl, double* out,
                                    MatrixDim out_dim, const double* in,
                                    int in_stride) {
diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc
index 909e5552a35..baee5f98e60 100644
--- a/src/cudamatrix/cu-matrix-test.cc
+++ b/src/cudamatrix/cu-matrix-test.cc
@@ -194,6 +194,30 @@ static void UnitTestCuMatrixApplyExp() {
 }
 
 
+template<typename Real>
+static void UnitTestCuMatrixApplyExpLimited() {
+  int32 M = 10 + Rand() % 20, N = 10 + Rand() % 20;
+  Matrix<Real> H(M, N);
+  H.SetRandn();
+
+
+  BaseFloat lower_limit = -0.2, upper_limit = 0.2;
+
+  CuMatrix<Real> D(H);
+
+  D.ApplyExpLimited(lower_limit, upper_limit);
+
+
+  H.ApplyFloor(lower_limit);
+  H.ApplyCeiling(upper_limit);
+  H.ApplyExp();
+
+  Matrix<Real> H2(D);
+
+  AssertEqual(H,H2);
+}
+
+
 
 template<typename Real>
 static void UnitTestCuMatrixSigmoid() {
@@ -2859,6 +2883,7 @@ static void UnitTestCuMatrixEqualElementMask() {
 
 template<typename Real> void CudaMatrixUnitTest() {
   UnitTestCuMatrixApplyExpSpecial<Real>();
+  UnitTestCuMatrixApplyExpLimited<Real>();
   UnitTextCuMatrixAddSmatMat<Real>();
   UnitTextCuMatrixAddMatSmat<Real>();
   UnitTextCuMatrixAddSmat<Real>();
diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc
index 813c5e75d14..a34804f534e 100644
--- a/src/cudamatrix/cu-matrix.cc
+++ b/src/cudamatrix/cu-matrix.cc
@@ -2498,6 +2498,37 @@ void CuMatrixBase<Real>::ApplyExp() {
   }
 }
 
+template<typename Real>
+void CuMatrixBase<Real>::ApplyExpLimited(Real lower_limit, Real upper_limit) {
+  KALDI_ASSERT(upper_limit > lower_limit);
+#if HAVE_CUDA == 1
+  if (CuDevice::Instantiate().Enabled()) {
+    CuTimer tim;
+    dim3 dimGrid, dimBlock;
+    GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(),
+                                          &dimGrid, &dimBlock);
+    cuda_apply_exp_limited(dimGrid, dimBlock, data_, Dim(), lower_limit, upper_limit);
+    CU_SAFE_CALL(cudaGetLastError());
+    CuDevice::Instantiate().AccuProfile(__func__, tim);
+  } else
+#endif
+  {
+    int32 num_rows = num_rows_, num_cols = num_cols_;
+    for (int32 r = 0; r < num_rows; r++) {
+      Real *row_data = this->RowData(r);
+      for (int32 c = 0; c < num_cols; c++) {
+        Real x = row_data[c];
+        if (!(x >= lower_limit))
+          x = lower_limit;
+        if (x > upper_limit)
+          x = upper_limit;
+        row_data[c] = Exp(x);
+      }
+    }
+  }
+}
+
+
 template<typename Real>
 void CuMatrixBase<Real>::ApplyExpSpecial() {
 #if HAVE_CUDA == 1
diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h
index 7c3a2a2e11f..0fa022e1569 100644
--- a/src/cudamatrix/cu-matrix.h
+++ b/src/cudamatrix/cu-matrix.h
@@ -390,6 +390,13 @@ class CuMatrixBase {
   void ApplyCeiling(Real ceiling_val);
   void ApplyExp();
 
+
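+  /// Clamps each element to the range [lower_limit, upper_limit] and then
+  /// exponentiates it; for non-NaN input this is equivalent to running
+  /// ApplyFloor(lower_limit); ApplyCeiling(upper_limit); ApplyExp().
+  /// NaN elements become exp(lower_limit).  Requires upper_limit > lower_limit.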
+  void ApplyExpLimited(Real lower_limit, Real upper_limit);
+
   /// For each element x of the matrix, set it to
   /// (x < 0 ? exp(x) : x + 1).  This function is used
   /// in our RNNLM training.
diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index 1ccb3d254fc..20706c11ad8 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -1414,7 +1414,7 @@ void* DropoutMaskComponent::Propagate(
   BaseFloat dropout_proportion = dropout_proportion_;
   KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0);
 
-  if (dropout_proportion_ == 0) {
+  if (dropout_proportion == 0) {
     out->Set(1.0);
     return NULL;
   }

From 2b24aaa2ba93a2330d5649f9ee25c01c94aac910 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 24 Mar 2018 23:47:22 -0400
Subject: [PATCH 08/10] [egs] revert some changes

---
 egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh | 2 +-
 egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py  | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
index 43073895382..f7681a743e1 100644
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh
@@ -10,7 +10,7 @@
 #Final valid prob         -0.113     -0.116
 #Final train prob (xent)  -1.25      -1.38
 #Final valid prob (xent)  -1.36      -1.48
-#Time consuming one iter  53.56s     48.18s
+#Time consuming one iter  53.56s     48.18s  
 #Time reduction percent   10.1%
 set -e
 
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
index 0019f5700c3..99911b39fb2 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py
@@ -23,7 +23,6 @@
         'relu-layer' : xlayers.XconfigBasicLayer,
         'relu-renorm-layer' : xlayers.XconfigBasicLayer,
         'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer,
-        'relu-dropout-batchnorm-layer' : xlayers.XconfigBasicLayer,
         'relu-dropout-layer': xlayers.XconfigBasicLayer,
         'relu-batchnorm-layer' : xlayers.XconfigBasicLayer,
         'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer,

From bfdb7a7b100fd72fec4c6501ab6950db81b06295 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 24 Mar 2018 23:36:19 -0400
Subject: [PATCH 09/10] [egs] some script updates

Conflicts:
	egs/swbd/s5c/local/chain/tuning/run_tdnn_7m26h2.sh
	egs/swbd/s5c/local/chain/tuning/run_tdnn_7m26l.sh
---
 egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
index 753dfc632ba..b927cc86823 100755
--- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
+++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh
@@ -18,7 +18,7 @@
 #
 #
 # local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp
-# System                tdnn7n_sp tdnn7m26j_sp
+# System                tdnn7n_sp tdnn7m26o_sp
 # WER on train_dev(tg)      12.18     11.74
 # WER on train_dev(fg)      11.12     10.69
 # WER on eval2000(tg)        14.9      14.6

From 78499c13befce4787edbcd2a255a9e58b6b5f4fd Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sat, 24 Mar 2018 23:30:57 -0400
Subject: [PATCH 10/10] [src] documentation improvements

---
 src/nnet3/nnet-simple-component.cc |  6 +----
 src/nnet3/nnet-simple-component.h  | 38 ++++++++++++++++++++++++++++--
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index f9f286aaed2..4eb078c0fcb 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -3730,15 +3730,11 @@ void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) {
                    // for the preconditioner actually exceeds the memory for the
                    // parameters (by "rank").
       update_period = 10;
-  BaseFloat num_samples_history = 2000.0, alpha = 4.0,
-      max_change_per_minibatch = 0.0;
+  BaseFloat num_samples_history = 2000.0, alpha = 4.0;
   cfl->GetValue("rank", &rank);
   cfl->GetValue("update-period", &update_period);
   cfl->GetValue("num-samples-history", &num_samples_history);
   cfl->GetValue("alpha", &alpha);
-  cfl->GetValue("max-change-per-minibatch", &max_change_per_minibatch);
-  if (max_change_per_minibatch != 0.0)
-    KALDI_WARN << "max-change-per-minibatch is now ignored, use 'max-change'";
   InitLearningRatesFromConfig(cfl);
   std::string filename;
   // Accepts "scales" config (for filename) or "dim" -> random init, for testing.
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index 9d438678f5d..3929c253aab 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -1446,6 +1446,19 @@ class PermuteComponent: public Component {
    trainable scale; it's like a linear component with a diagonal matrix.  This
    version (and its child class NaturalGradientPerElementScaleComponent)
    requires the input for backprop.  See also ScaleAndOffsetComponent.
+
+   Accepted values on its config line, with defaults if applicable:
+
+     vector           If specified, the scales will be read from this file ('vector'
+                      is interpreted as an rxfilename).
+
+     dim              The dimension that this component inputs and outputs.
+                      Only required if 'vector' is not specified.
+
+     param-mean=1.0   Mean of randomly initialized scale parameters; should only
+                      be supplied if 'vector' is not supplied.
+     param-stddev=0.0 Standard deviation of randomly initialized scale parameters;
+                      should only be supplied if 'vector' is not supplied.
 */
 class PerElementScaleComponent: public UpdatableComponent {
  public:
@@ -1670,8 +1683,29 @@ class ConstantFunctionComponent: public UpdatableComponent {
 
 
 
-// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
-// it uses a natural gradient update for the per-element scales.
+/**
+   NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but
+   it uses a natural gradient update for the per-element scales.
+
+   Accepted values on its config line, with defaults if applicable:
+
+     scales           If specified, the scales will be read from this file ('scales'
+                      is interpreted as an rxfilename).
+
+     dim              The dimension that this component inputs and outputs.
+                      Only required if 'scales' is not specified.
+
+     param-mean=1.0   Mean of randomly initialized scale parameters; should only
+                      be supplied if 'scales' is not supplied.
+     param-stddev=0.0 Standard deviation of randomly initialized scale parameters;
+                      should only be supplied if 'scales' is not supplied.
+
+  And the natural-gradient-related configuration values:
+      rank=8
+      update-period=10
+      num-samples-history=2000.0
+      alpha=4.0
+*/
 class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent {
  public: