From 5c9a539312ef02cc8b9969d592026a5a29956c25 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 10 May 2017 16:24:20 -0400 Subject: [PATCH] [src] Fix bug found by Hainan and Yiming regarding looped computation. --- src/nnet3/nnet-analyze.cc | 7 +- src/nnet3/nnet-analyze.h | 12 ++-- src/nnet3/nnet-optimize-utils.cc | 91 +++++++++++--------------- src/nnet3/nnet-optimize.cc | 108 ++++--------------------------- 4 files changed, 59 insertions(+), 159 deletions(-) diff --git a/src/nnet3/nnet-analyze.cc b/src/nnet3/nnet-analyze.cc index 98e7e7724c6..9d6dba371a7 100644 --- a/src/nnet3/nnet-analyze.cc +++ b/src/nnet3/nnet-analyze.cc @@ -1280,12 +1280,13 @@ void Analyzer::Init(const Nnet &nnet, const NnetComputation &computation) { &matrix_accesses); } -void GetSegmentEnds(const NnetComputation &computation, - std::vector *command_indexes) { +void GetCommandsOfType(const NnetComputation &computation, + CommandType t, + std::vector *command_indexes) { int32 num_commands = computation.commands.size(); command_indexes->clear(); for (int32 c = 0; c < num_commands; c++) - if (computation.commands[c].command_type == kNoOperationMarker) + if (computation.commands[c].command_type == t) command_indexes->push_back(c); } diff --git a/src/nnet3/nnet-analyze.h b/src/nnet3/nnet-analyze.h index 2b71a8c2411..532648dd5f2 100644 --- a/src/nnet3/nnet-analyze.h +++ b/src/nnet3/nnet-analyze.h @@ -423,13 +423,11 @@ class ComputationChecker { }; -/// This utility function works out from a computation, the locations of the -/// 'segment ends'. This is useful for online compilation, where the -/// computation has multiple segments corresponding to new pieces of input data -/// to process. The implementation of the function is extremely simple; it -/// just gives you the locations of commands of type 'kNoOperationMarker'. -void GetSegmentEnds(const NnetComputation &computation, - std::vector *command_indexes); +/// This utility function works out from a computation, the command-indexes of +/// the commands of the given type. +void GetCommandsOfType(const NnetComputation &computation, + CommandType t, + std::vector *command_indexes); /// This is a convenience interface for class ComputationChecker. Call it with /// check_rewrite = true only if the computation is pre-optimization. diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 96e00fa1506..38e2559ac62 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -3301,8 +3301,7 @@ class ComputationLoopedOptimizer { private: // Figures out the time shift between the successive computation requests. - static int32 FindTimeShift(const NnetComputation &computation, - const std::vector &segment_ends); + static int32 FindTimeShift(const NnetComputation &computation); // This function creates a mapping from a matrix-index > 0, // to a pair (unique_id, time_offset) that represents the debug-info @@ -3440,7 +3439,7 @@ class ComputationLoopedOptimizer { - /// Given a list of command indexes ('segment_end_commands') which are + /// Given a list of command indexes ('splice_point_commands') which are /// expected to be command indexes of the kNoOperationMarker at segment /// boundaries, this function outputs for each of these command indexes a list /// of matrices which are 'active' at that point in time. By 'active' we mean @@ -3448,12 +3447,12 @@ class ComputationLoopedOptimizer { /// initialization with zeros as being written to); and will be read after /// that time. These is the list of matrices that 'need to be in scope' /// at those points in time. '*active_matrices' is indexed by the - /// same index as 'segment_end_commands', and is then a list of active + /// same index as 'splice_point_commands', and is then a list of active /// matrices, in numerical order of matrix index. /// Note: for each i, (*active_matrices)[i] will be sorted and unique. static void FindActiveMatrices(const NnetComputation &computation, const Analyzer &analyzer, - const std::vector &segment_end_commands, + const std::vector &splice_point_commands, std::vector > *active_matrices); @@ -3462,13 +3461,14 @@ class ComputationLoopedOptimizer { Analyzer analyzer_; std::vector > matrix_to_pair_; - std::vector segment_end_commands_; + std::vector splice_point_commands_; }; // static int32 ComputationLoopedOptimizer::FindTimeShift( - const NnetComputation &computation, - const std::vector &segment_ends) { + const NnetComputation &computation) { + std::vector segment_ends; + GetCommandsOfType(computation, kNoOperationMarker, &segment_ends); KALDI_ASSERT(segment_ends.size() >= 3); // Ignore the first segment as it tends to be a special case // (it has more left context), @@ -3647,19 +3647,9 @@ bool ComputationLoopedOptimizer::FindFirstRepeat( // differ. KALDI_ASSERT(num_segments >= 2); - bool perform_time_offset_check = true; - if (normalized_active_pairs.back().empty()) { - // If there are no variables active after the end of the last-but-one segment - // (which is the last element in segment_ends, since we remove the end of the - // very last segment), then don't perform the check related to - // time-offsets, it's not relevant. [this would probably be a computation - // that doesn't require any context]. - perform_time_offset_check = false; - } for (int32 s = 0; s < num_segments; s++) { for (int32 t = s + 1; t < num_segments; t++) { - if ((!perform_time_offset_check || - time_offsets[t]-time_offsets[s] == (t-s) * time_shift_per_segment) && + if ((time_offsets[t]-time_offsets[s] == (t-s) * time_shift_per_segment) && normalized_active_pairs[s] == normalized_active_pairs[t]) { *seg1 = s; *seg2 = t; @@ -3696,16 +3686,16 @@ void ComputationLoopedOptimizer::PairListToMatrixList( void ComputationLoopedOptimizer::FindActiveMatrices( const NnetComputation &computation, const Analyzer &analyzer, - const std::vector &segment_end_commands, + const std::vector &splice_point_commands, std::vector > *active_matrices) { int32 num_matrices = computation.matrices.size(); - int32 num_segments = segment_end_commands.size(); + int32 num_splice_points = splice_point_commands.size(); active_matrices->clear(); - active_matrices->resize(num_segments); + active_matrices->resize(num_splice_points); // this object just makes available some extra functions, vs. the Analyzer // object. ComputationAnalysis analysis(computation, analyzer); - KALDI_ASSERT(IsSortedAndUniq(segment_end_commands)); + KALDI_ASSERT(IsSortedAndUniq(splice_point_commands)); // the following vector gives us, for each matrix index, a submatrix index // that covers the whole of that matrix (needed by interface of 'analysis' object). @@ -3713,18 +3703,18 @@ void ComputationLoopedOptimizer::FindActiveMatrices( computation.GetWholeSubmatrices(&whole_submatrices); for (int32 m = 1; m < num_matrices; m++) { // the following are command indexes, comparable with the indexes - // in 'segment_end_commands'. + // in 'splice_point_commands'. int32 s = whole_submatrices[m], // submatrix consisting of the whole of // 'm'. first_access = analysis.FirstAccess(s), last_access = analysis.LastAccess(s); - for (int32 seg = 0; seg < num_segments; seg++) { - int32 segment_end = segment_end_commands[seg]; - if (first_access < segment_end && last_access > segment_end) { + for (int32 i = 0; i < num_splice_points; i++) { + int32 splice_point = splice_point_commands[i]; + if (first_access < splice_point && last_access > splice_point) { // If the block of time during which the matrix is accessed, includes - // this segment end-point, then the matrix is considered 'active' at - // that time. - (*active_matrices)[seg].push_back(m); + // this command index, then the matrix is considered 'active' at this + // time. + (*active_matrices)[i].push_back(m); } } } @@ -3860,8 +3850,8 @@ void ComputationLoopedOptimizer::FormInfiniteLoop( KALDI_ASSERT(static_cast(computation->commands.size()) >= command2 + 1 && command1 < command2); KALDI_ASSERT( - computation->commands[command1].command_type == kNoOperationMarker && - computation->commands[command2].command_type == kNoOperationMarker); + computation->commands[command1].command_type == kNoOperationPermanent && + computation->commands[command2].command_type == kNoOperationPermanent); // Remove any commands after 'command2'. computation->commands.resize(command2 + 1); computation->commands[command2].command_type = kGotoLabel; @@ -3880,25 +3870,22 @@ bool ComputationLoopedOptimizer::Optimize() { "You must request matrix debug info when compiling " "looped computations."); - // get the indexes of the separator commands at the ends of segments. - std::vector segment_ends; - GetSegmentEnds(*computation_, &segment_ends); - int32 time_shift_per_segment = FindTimeShift(*computation_, - segment_ends); - - // Ignore the end of the very last segment- it is not a candidate for a - // 'splice point'. What we're doing here is like creating a tape loop; we - // have to find a place where the list of variables is the same except for a - // time offset. - // [note: it's not exactly like a tape loop because the prologue can - // vary... the sequence is of the form like a b b b b b .. ] - segment_ends.pop_back(); + // get the indexes of potential splice points, one per segment of the + // computation. We locate the splice points where kNoOperationPermanent is. + // This is guaranteed to be after the inputs have been received, and before + // the bulk of the computation in the segment, and of course before we provide + // the output. It happens that by choosing this as the splice point we avoid + // certain problems that would arise, for instance, if we chose the segment + // boundaries (kNoOperationMarker). + std::vector splice_points; + GetCommandsOfType(*computation_, kNoOperationPermanent, + &splice_points); + int32 time_shift_per_segment = FindTimeShift(*computation_); std::vector > active_matrices; - // Find the list of matrices active at each of those segment-end-command - // times. - FindActiveMatrices(*computation_, analyzer_, segment_ends, + // Find the list of matrices active at each of the potential splice points. + FindActiveMatrices(*computation_, analyzer_, splice_points, &active_matrices); // Find a representation of the matrices of the computation as pairs @@ -3912,7 +3899,7 @@ bool ComputationLoopedOptimizer::Optimize() { unordered_map, int32, PairHasher > pair_to_matrix; GetPairToMatrixMap(matrix_to_pair, &pair_to_matrix); - // get lists of matrix per segment in the pair representation. + // get lists of matrix per splice-point in the pair representation. std::vector > > pair_lists; ConvertListsToPairLists(active_matrices, matrix_to_pair, &pair_lists); @@ -3920,8 +3907,8 @@ bool ComputationLoopedOptimizer::Optimize() { std::vector time_offsets; NormalizePairLists(&pair_lists, &time_offsets); - // Note: seg1 and seg2 are indexes into 'segment_ends', representing - // points in time (that happen to be the ends of segments). + // Note: seg1 and seg2 are indexes into 'splice_points', representing + // potential splice points (located near the beginnings of segments). int32 seg1, seg2; if (!FindFirstRepeat(pair_lists, time_offsets, @@ -3945,7 +3932,7 @@ bool ComputationLoopedOptimizer::Optimize() { time_difference); - FormInfiniteLoop(segment_ends[seg1], segment_ends[seg2], computation_); + FormInfiniteLoop(splice_points[seg1], splice_points[seg2], computation_); AddMatrixSwapCommands(seg1_matrices, seg2_matrices, computation_); diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index df23bc4ce84..71a97c22748 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -123,6 +123,8 @@ bool NnetOptimizeOptions::operator == (const NnetOptimizeOptions &other) const { } // move commands that resize matrices to as late/early as possible. +// (however, keep input and output commands where they were; it +// creates other headaches if we move those). void MoveSizingCommands(const Nnet &nnet, NnetComputation *computation) { ComputationVariables variables; variables.Init(*computation); @@ -163,15 +165,20 @@ void MoveSizingCommands(const Nnet &nnet, NnetComputation *computation) { } if (first_access_command != -1) { KALDI_ASSERT(first_access_command > ma.allocate_command); - // move the initialization command to just before the first access. - commands[ma.allocate_command].first = first_access_command * 3 - 1; + // move the initialization command to just before the first access, + // it wasn't a kAcceptInput command. + if (commands[ma.allocate_command].second->command_type != kAcceptInput) + commands[ma.allocate_command].first = first_access_command * 3 - 1; } } if (ma.deallocate_command != -1) { if (!ma.accesses.empty()) { int32 last_access_command = ma.accesses.back().command_index; - // move the destruction command to just after the last access. - commands[ma.deallocate_command].first = last_access_command * 3 + 1; + // move the destruction command to just after the last access, if it + // wasn't a kProvideOutput command. + if (commands[ma.deallocate_command].second->command_type != + kProvideOutput) + commands[ma.deallocate_command].first = last_access_command * 3 + 1; } } } @@ -437,7 +444,6 @@ void Optimize(const NnetOptimizeOptions &config, if (GetVerboseLevel() >= 4) CheckComputation(nnet, *computation, true); - { // Call LimitDerivativeTimes(); it's important that this // should come before other optimizations (search for "insist" in // nnet-optimize-utils.cc for the reasons). @@ -831,98 +837,9 @@ static void SplitComputationIntoSegments( segments->push_back(std::pair(cur_start, num_commands)); } -// This is a helper function used in ConsolidateIoOperations(). -// -// Suppose we had something like this before ConsolidateIoOperations() (as would -// be printed by Print() - -// c90: output m50 to user [for node: 'output'] -// ... -// c100: [label for goto statement] -// c101: # computation segment separator [e.g., begin backward commands] -// ... -// c105: m62 = user input [for node: 'input'] -// ... -// c190: output m79 to user [for node: 'output'] -// ... -// c200: goto c100 -// -// this would get reordered to the following by ConsolidateIoOperations -// (the bulk of the code, before this function is called): -// -// c99: [label for goto statement] -// c100: output m50 to user [for node: 'output'] -// c101: # computation segment separator [e.g., begin backward commands] -// c102: m62 = user input [for node: 'input'] -// ... -// c199: goto c199 -// c200: output m79 to user [for node: 'output'] -// -// Now command c200 is unreachable, but there is a similar command at c100 -// (after the goto) that will substitute. However, the matrix indexes are different. -// So we need to change the above so that the last two commands read: -// c199: m50.swap(m79} -// c200: goto c199 -void FixGotoOutputReordering(const Nnet &nnet, - NnetComputation *computation) { - FixGotoLabel(computation); // make sure the destination label of the goto statement was - // correct. - int32 goto_command_index = -1; - for (int32 c = computation->commands.size() - 1; c >= 0; c--) - if (computation->commands[c].command_type == kGotoLabel) - goto_command_index = c; - KALDI_ASSERT(goto_command_index > 0); - int32 goto_label_index = computation->commands[goto_command_index].arg1; - - std::vector output_commands_after_goto, - output_commands_after_label; - for (int32 c = goto_command_index + 1; - c < static_cast(computation->commands.size()); c++) { - KALDI_ASSERT(computation->commands[c].command_type == kProvideOutput); - output_commands_after_goto.push_back(c); - } - for (int32 c = goto_label_index + 1; - c < goto_command_index; c++) { // note: we break from this loop. - CommandType t = computation->commands[c].command_type; - if (t == kProvideOutput) - output_commands_after_label.push_back(c); - else if (t != kNoOperationMarker && t != kAcceptInput) - break; - } - if (output_commands_after_goto.size() != output_commands_after_label.size()) { - computation->Print(std::cerr, nnet); - KALDI_ERR << "Could not fix goto/output reordering, size mismatch."; - } - NnetComputation::Command goto_command = computation->commands[goto_command_index]; - // be we'll be replacing the final kProvideOutput commands with - // kAllocMatrixFromOther [i.e. swap commands], and moving them one command - // backward; later we'll put the goto command at the end. - for (size_t i = 0; i < output_commands_after_goto.size(); i++) { - int32 c1 = output_commands_after_label[i], - c2 = output_commands_after_goto[i], - new_c2 = c2 - 1; - int32 s1 = computation->commands[c1].arg1, - s2 = computation->commands[c2].arg1; - // The following assert checks that the network node-index is the same... - // the idea is that the outputs should have been provided in the same order. - // I can think of no reason why the order might be different. - KALDI_ASSERT(computation->commands[c1].arg2 == - computation->commands[c1].arg2); - computation->commands[new_c2].command_type = kAllocMatrixFromOther; - computation->commands[new_c2].arg1 = s1; - computation->commands[new_c2].arg2 = s2; - } - // ... and move the goto command to the end. - computation->commands.back() = goto_command; -} - void ConsolidateIoOperations(const Nnet &nnet, NnetComputation *computation) { - bool ends_with_goto = - (!computation->commands.empty() && - computation->commands.back().command_type == kGotoLabel); - // These segments, represented as (start-index, end-index), // are segments of the computation separated by kNoOperationMarker. std::vector > segments; @@ -971,9 +888,6 @@ void ConsolidateIoOperations(const Nnet &nnet, KALDI_ASSERT(c == segment_end); } computation->commands.swap(reordered_commands); - - if (ends_with_goto) - FixGotoOutputReordering(nnet, computation); }