From 5c9a539312ef02cc8b9969d592026a5a29956c25 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Wed, 10 May 2017 16:24:20 -0400
Subject: [PATCH] [src] Fix bug found by Hainan and Yiming regarding looped
 computation.

---
 src/nnet3/nnet-analyze.cc        |   7 +-
 src/nnet3/nnet-analyze.h         |  12 ++--
 src/nnet3/nnet-optimize-utils.cc |  91 +++++++++++---------------
 src/nnet3/nnet-optimize.cc       | 108 ++++---------------------------
 4 files changed, 59 insertions(+), 159 deletions(-)
diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc
index 98e7e7724c6..9d6dba371a7 100644
--- a/src/nnet3/nnet-analyze.cc
+++ b/src/nnet3/nnet-analyze.cc
@@ -1280,12 +1280,13 @@ void Analyzer::Init(const Nnet &nnet, const NnetComputation &computation) {
                         &matrix_accesses);
 }
 
-void GetSegmentEnds(const NnetComputation &computation,
-                    std::vector<int32> *command_indexes) {
+void GetCommandsOfType(const NnetComputation &computation,
+                       CommandType t,
+                       std::vector<int32> *command_indexes) {
   int32 num_commands = computation.commands.size();
   command_indexes->clear();
   for (int32 c = 0; c < num_commands; c++)
-    if (computation.commands[c].command_type == kNoOperationMarker)
+    if (computation.commands[c].command_type == t)
       command_indexes->push_back(c);
 }
 
diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h
index 2b71a8c2411..532648dd5f2 100644
--- a/src/nnet3/nnet-analyze.h
+++ b/src/nnet3/nnet-analyze.h
@@ -423,13 +423,11 @@ class ComputationChecker {
 };
 
 
-/// This utility function works out from a computation, the locations of the
-/// 'segment ends'.  This is useful for online compilation, where the
-/// computation has multiple segments corresponding to new pieces of input data
-/// to process.  The implementation of the function is extremely simple; it
-/// just gives you the locations of commands of type 'kNoOperationMarker'.
-void GetSegmentEnds(const NnetComputation &computation,
-                    std::vector<int32> *command_indexes);
+/// This utility function works out from a computation, the command-indexes of
+/// the commands of the given type.
+void GetCommandsOfType(const NnetComputation &computation,
+                       CommandType t,
+                       std::vector<int32> *command_indexes);
 
 /// This is a convenience interface for class ComputationChecker.  Call it with
 /// check_rewrite = true only if the computation is pre-optimization.
diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc
index 96e00fa1506..38e2559ac62 100644
--- a/src/nnet3/nnet-optimize-utils.cc
+++ b/src/nnet3/nnet-optimize-utils.cc
@@ -3301,8 +3301,7 @@ class ComputationLoopedOptimizer {
  private:
 
   // Figures out the time shift between the successive computation requests.
-  static int32 FindTimeShift(const NnetComputation &computation,
-                             const std::vector<int32> &segment_ends);
+  static int32 FindTimeShift(const NnetComputation &computation);
 
   // This function creates a mapping from a matrix-index > 0,
   // to a pair (unique_id, time_offset) that represents the debug-info
@@ -3440,7 +3439,7 @@ class ComputationLoopedOptimizer {
 
 
 
-  /// Given a list of command indexes ('segment_end_commands') which are
+  /// Given a list of command indexes ('splice_point_commands') which are
   /// expected to be command indexes of the kNoOperationMarker at segment
   /// boundaries, this function outputs for each of these command indexes a list
   /// of matrices which are 'active' at that point in time.  By 'active' we mean
@@ -3448,12 +3447,12 @@ class ComputationLoopedOptimizer {
   /// initialization with zeros as being written to); and will be read after
   /// that time.  These is the list of matrices that 'need to be in scope'
   /// at those points in time.  '*active_matrices' is indexed by the
-  /// same index as 'segment_end_commands', and is then a list of active
+  /// same index as 'splice_point_commands', and is then a list of active
   /// matrices, in numerical order of matrix index.
   /// Note: for each i, (*active_matrices)[i] will be sorted and unique.
   static void FindActiveMatrices(const NnetComputation &computation,
                                  const Analyzer &analyzer,
-                                 const std::vector<int32> &segment_end_commands,
+                                 const std::vector<int32> &splice_point_commands,
                                  std::vector<std::vector<int32> > *active_matrices);
 
 
@@ -3462,13 +3461,14 @@ class ComputationLoopedOptimizer {
   Analyzer analyzer_;
   std::vector<std::pair<int32, int32> > matrix_to_pair_;
 
-  std::vector<int32> segment_end_commands_;
+  std::vector<int32> splice_point_commands_;
 };
 
 // static
 int32 ComputationLoopedOptimizer::FindTimeShift(
-    const NnetComputation &computation,
-    const std::vector<int32> &segment_ends) {
+    const NnetComputation &computation) {
+  std::vector<int32> segment_ends;
+  GetCommandsOfType(computation, kNoOperationMarker, &segment_ends);
   KALDI_ASSERT(segment_ends.size() >= 3);
   // Ignore the first segment as it tends to be a special case
   // (it has more left context),
@@ -3647,19 +3647,9 @@ bool ComputationLoopedOptimizer::FindFirstRepeat(
   // differ.
   KALDI_ASSERT(num_segments >= 2);
 
-  bool perform_time_offset_check = true;
-  if (normalized_active_pairs.back().empty()) {
-    // If there are no variables active after the end of the last-but-one segment
-    // (which is the last element in segment_ends, since we remove the end of the
-    // very last segment), then don't perform the check related to
-    // time-offsets, it's not relevant.  [this would probably be a computation
-    // that doesn't require any context].
-    perform_time_offset_check = false;
-  }
   for (int32 s = 0; s < num_segments; s++) {
     for (int32 t = s + 1; t < num_segments; t++) {
-      if ((!perform_time_offset_check ||
-           time_offsets[t]-time_offsets[s] == (t-s) * time_shift_per_segment) &&
+      if ((time_offsets[t]-time_offsets[s] == (t-s) * time_shift_per_segment) &&
           normalized_active_pairs[s] == normalized_active_pairs[t]) {
         *seg1 = s;
         *seg2 = t;
@@ -3696,16 +3686,16 @@ void ComputationLoopedOptimizer::PairListToMatrixList(
 void ComputationLoopedOptimizer::FindActiveMatrices(
     const NnetComputation &computation,
     const Analyzer &analyzer,
-    const std::vector<int32> &segment_end_commands,
+    const std::vector<int32> &splice_point_commands,
     std::vector<std::vector<int32> > *active_matrices) {
   int32 num_matrices = computation.matrices.size();
-  int32 num_segments = segment_end_commands.size();
+  int32 num_splice_points = splice_point_commands.size();
   active_matrices->clear();
-  active_matrices->resize(num_segments);
+  active_matrices->resize(num_splice_points);
   // this object just makes available some extra functions, vs. the Analyzer
   // object.
   ComputationAnalysis analysis(computation, analyzer);
-  KALDI_ASSERT(IsSortedAndUniq(segment_end_commands));
+  KALDI_ASSERT(IsSortedAndUniq(splice_point_commands));
 
   // the following vector gives us, for each matrix index, a submatrix index
   // that covers the whole of that matrix (needed by interface of 'analysis' object).
@@ -3713,18 +3703,18 @@ void ComputationLoopedOptimizer::FindActiveMatrices(
   computation.GetWholeSubmatrices(&whole_submatrices);
   for (int32 m = 1; m < num_matrices; m++) {
     // the following are command indexes, comparable with the indexes
-    // in 'segment_end_commands'.
+    // in 'splice_point_commands'.
     int32 s = whole_submatrices[m],  // submatrix consisting of the whole of
                                      // 'm'.
         first_access = analysis.FirstAccess(s),
         last_access = analysis.LastAccess(s);
-    for (int32 seg = 0; seg < num_segments; seg++) {
-      int32 segment_end = segment_end_commands[seg];
-      if (first_access < segment_end && last_access > segment_end) {
+    for (int32 i = 0; i < num_splice_points; i++) {
+      int32 splice_point = splice_point_commands[i];
+      if (first_access < splice_point && last_access > splice_point) {
         // If the block of time during which the matrix is accessed, includes
-        // this segment end-point, then the matrix is considered 'active' at
-        // that time.
-        (*active_matrices)[seg].push_back(m);
+        // this command index, then the matrix is considered 'active' at this
+        // time.
+        (*active_matrices)[i].push_back(m);
       }
     }
   }
@@ -3860,8 +3850,8 @@ void ComputationLoopedOptimizer::FormInfiniteLoop(
   KALDI_ASSERT(static_cast<int32>(computation->commands.size()) >=
                command2 + 1 && command1 < command2);
   KALDI_ASSERT(
-      computation->commands[command1].command_type == kNoOperationMarker &&
-      computation->commands[command2].command_type == kNoOperationMarker);
+      computation->commands[command1].command_type == kNoOperationPermanent &&
+      computation->commands[command2].command_type == kNoOperationPermanent);
   // Remove any commands after 'command2'.
   computation->commands.resize(command2 + 1);
   computation->commands[command2].command_type = kGotoLabel;
@@ -3880,25 +3870,22 @@ bool ComputationLoopedOptimizer::Optimize() {
                "You must request matrix debug info when compiling "
                "looped computations.");
 
-  // get the indexes of the separator commands at the ends of segments.
-  std::vector<int32> segment_ends;
-  GetSegmentEnds(*computation_, &segment_ends);
-  int32 time_shift_per_segment = FindTimeShift(*computation_,
-                                               segment_ends);
-
-  // Ignore the end of the very last segment- it is not a candidate for a
-  // 'splice point'.  What we're doing here is like creating a tape loop; we
-  // have to find a place where the list of variables is the same except for a
-  // time offset.
-  // [note: it's not exactly like a tape loop because the prologue can
-  // vary... the sequence is of the form like a b b b b b .. ]
-  segment_ends.pop_back();
+  // get the indexes of potential splice points, one per segment of the
+  // computation.  We locate the splice points where kNoOperationPermanent is.
+  // This is guaranteed to be after the inputs have been received, and before
+  // the bulk of the computation in the segment, and of course before we provide
+  // the output.  It happens that by choosing this as the splice point we avoid
+  // certain problems that would arise, for instance, if we chose the segment
+  // boundaries (kNoOperationMarker).
+  std::vector<int32> splice_points;
+  GetCommandsOfType(*computation_, kNoOperationPermanent,
+                    &splice_points);
+  int32 time_shift_per_segment = FindTimeShift(*computation_);
 
 
   std::vector<std::vector<int32> > active_matrices;
-  // Find the list of matrices active at each of those segment-end-command
-  // times.
-  FindActiveMatrices(*computation_, analyzer_, segment_ends,
+  // Find the list of matrices active at each of the potential splice points.
+  FindActiveMatrices(*computation_, analyzer_, splice_points,
                      &active_matrices);
 
   // Find a representation of the matrices of the computation as pairs
@@ -3912,7 +3899,7 @@ bool ComputationLoopedOptimizer::Optimize() {
   unordered_map<std::pair<int32, int32>, int32, PairHasher<int32> > pair_to_matrix;
   GetPairToMatrixMap(matrix_to_pair, &pair_to_matrix);
 
-  // get lists of matrix per segment in the pair representation.
+  // get lists of matrix per splice-point in the pair representation.
   std::vector<std::vector<std::pair<int32, int32> > > pair_lists;
   ConvertListsToPairLists(active_matrices, matrix_to_pair,
                           &pair_lists);
@@ -3920,8 +3907,8 @@ bool ComputationLoopedOptimizer::Optimize() {
   std::vector<int32> time_offsets;
   NormalizePairLists(&pair_lists, &time_offsets);
 
-  // Note: seg1 and seg2 are indexes into 'segment_ends', representing
-  // points in time (that happen to be the ends of segments).
+  // Note: seg1 and seg2 are indexes into 'splice_points', representing
+  // potential splice points (located near the beginnings of segments).
   int32 seg1, seg2;
   if (!FindFirstRepeat(pair_lists,
                        time_offsets,
@@ -3945,7 +3932,7 @@ bool ComputationLoopedOptimizer::Optimize() {
                           time_difference);
 
 
-  FormInfiniteLoop(segment_ends[seg1], segment_ends[seg2], computation_);
+  FormInfiniteLoop(splice_points[seg1], splice_points[seg2], computation_);
 
   AddMatrixSwapCommands(seg1_matrices, seg2_matrices, computation_);
 
diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc
index df23bc4ce84..71a97c22748 100644
--- a/src/nnet3/nnet-optimize.cc
+++ b/src/nnet3/nnet-optimize.cc
@@ -123,6 +123,8 @@ bool NnetOptimizeOptions::operator == (const NnetOptimizeOptions &other) const {
 }
 
 // move commands that resize matrices to as late/early as possible.
+// (however, keep input and output commands where they were; it
+// creates other headaches if we move those).
 void MoveSizingCommands(const Nnet &nnet, NnetComputation *computation) {
   ComputationVariables variables;
   variables.Init(*computation);
@@ -163,15 +165,20 @@ void MoveSizingCommands(const Nnet &nnet, NnetComputation *computation) {
       }
       if (first_access_command != -1) {
         KALDI_ASSERT(first_access_command > ma.allocate_command);
-        // move the initialization command to just before the first access.
-        commands[ma.allocate_command].first = first_access_command * 3 - 1;
+        // move the initialization command to just before the first access,
+        // it wasn't a kAcceptInput command.
+        if (commands[ma.allocate_command].second->command_type != kAcceptInput)
+          commands[ma.allocate_command].first = first_access_command * 3 - 1;
       }
     }
     if (ma.deallocate_command != -1) {
       if (!ma.accesses.empty()) {
         int32 last_access_command = ma.accesses.back().command_index;
-        // move the destruction command to just after the last access.
-        commands[ma.deallocate_command].first = last_access_command * 3 + 1;
+        // move the destruction command to just after the last access, if it
+        // wasn't a kProvideOutput command.
+        if (commands[ma.deallocate_command].second->command_type !=
+            kProvideOutput)
+          commands[ma.deallocate_command].first = last_access_command * 3 + 1;
       }
     }
   }
@@ -437,7 +444,6 @@ void Optimize(const NnetOptimizeOptions &config,
   if (GetVerboseLevel() >= 4)
     CheckComputation(nnet, *computation, true);
 
-
   { // Call LimitDerivativeTimes(); it's important that this
     // should come before other optimizations (search for "insist" in
     // nnet-optimize-utils.cc for the reasons).
@@ -831,98 +837,9 @@ static void SplitComputationIntoSegments(
   segments->push_back(std::pair<int32, int32>(cur_start, num_commands));
 }
 
-// This is a helper function used in ConsolidateIoOperations().
-//
-// Suppose we had something like this before ConsolidateIoOperations() (as would
-// be printed by Print()
-
-//  c90: output m50 to user [for node: 'output']
-//  ...
-//  c100: [label for goto statement]
-//  c101: # computation segment separator [e.g., begin backward commands]
-//  ...
-//  c105: m62 = user input [for node: 'input']
-//  ...
-//  c190: output m79 to user [for node: 'output']
-//  ...
-//  c200: goto c100
-//
-//  this would get reordered to the following by ConsolidateIoOperations
-//  (the bulk of the code, before this function is called):
-//
-//  c99: [label for goto statement]
-//  c100: output m50 to user [for node: 'output']
-//  c101: # computation segment separator [e.g., begin backward commands]
-//  c102: m62 = user input [for node: 'input']
-//  ...
-//  c199: goto c199
-//  c200: output m79 to user [for node: 'output']
-//
-// Now command c200 is unreachable, but there is a similar command at c100
-// (after the goto) that will substitute.  However, the matrix indexes are different.
-// So we need to change the above so that the last two commands read:
-//  c199: m50.swap(m79}
-//  c200: goto c199
-void FixGotoOutputReordering(const Nnet &nnet,
-                             NnetComputation *computation) {
-  FixGotoLabel(computation);  // make sure the destination label of the goto statement was
-                              // correct.
-  int32 goto_command_index = -1;
-  for (int32 c = computation->commands.size() - 1; c >= 0; c--)
-    if (computation->commands[c].command_type == kGotoLabel)
-      goto_command_index = c;
-  KALDI_ASSERT(goto_command_index > 0);
-  int32 goto_label_index = computation->commands[goto_command_index].arg1;
-
-  std::vector<int32> output_commands_after_goto,
-      output_commands_after_label;
-  for (int32 c = goto_command_index + 1;
-       c < static_cast<int32>(computation->commands.size()); c++) {
-    KALDI_ASSERT(computation->commands[c].command_type == kProvideOutput);
-    output_commands_after_goto.push_back(c);
-  }
-  for (int32 c = goto_label_index + 1;
-       c < goto_command_index; c++) {  // note: we break from this loop.
-    CommandType t = computation->commands[c].command_type;
-    if (t == kProvideOutput)
-      output_commands_after_label.push_back(c);
-    else if (t != kNoOperationMarker && t != kAcceptInput)
-      break;
-  }
-  if (output_commands_after_goto.size() != output_commands_after_label.size()) {
-    computation->Print(std::cerr, nnet);
-    KALDI_ERR << "Could not fix goto/output reordering, size mismatch.";
-  }
-  NnetComputation::Command goto_command = computation->commands[goto_command_index];
-  // be we'll be replacing the final kProvideOutput commands with
-  // kAllocMatrixFromOther [i.e. swap commands], and moving them one command
-  // backward; later we'll put the goto command at the end.
-  for (size_t i = 0; i < output_commands_after_goto.size(); i++) {
-    int32 c1 = output_commands_after_label[i],
-        c2 = output_commands_after_goto[i],
-        new_c2 = c2 - 1;
-    int32 s1 = computation->commands[c1].arg1,
-        s2 = computation->commands[c2].arg1;
-    // The following assert checks that the network node-index is the same...
-    // the idea is that the outputs should have been provided in the same order.
-    // I can think of no reason why the order might be different.
-    KALDI_ASSERT(computation->commands[c1].arg2 ==
-                 computation->commands[c1].arg2);
-    computation->commands[new_c2].command_type = kAllocMatrixFromOther;
-    computation->commands[new_c2].arg1 = s1;
-    computation->commands[new_c2].arg2 = s2;
-  }
-  // ... and move the goto command to the end.
-  computation->commands.back() = goto_command;
-}
-
 
 void ConsolidateIoOperations(const Nnet &nnet,
                              NnetComputation *computation) {
-  bool ends_with_goto =
-      (!computation->commands.empty() &&
-       computation->commands.back().command_type == kGotoLabel);
-
   // These segments, represented as (start-index, end-index),
   // are segments of the computation separated by kNoOperationMarker.
   std::vector<std::pair<int32, int32> > segments;
@@ -971,9 +888,6 @@ void ConsolidateIoOperations(const Nnet &nnet,
     KALDI_ASSERT(c == segment_end);
   }
   computation->commands.swap(reordered_commands);
-
-  if (ends_with_goto)
-    FixGotoOutputReordering(nnet, computation);
 }