From 5fe8b138ad97cfa1f6cdac1d5b64c8ebbef9205e Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 6 Feb 2018 20:28:58 -0500 Subject: [PATCH 01/10] [src] Add a nnet3 optimization that tries to replace commands ending in Multi with other commands. --- src/nnet3/nnet-computation.h | 13 +- src/nnet3/nnet-optimize-utils.cc | 322 +++++++++++++++++++++++++++++++ src/nnet3/nnet-optimize-utils.h | 41 ++-- src/nnet3/nnet-optimize.cc | 14 +- src/nnet3/nnet-optimize.h | 6 + src/nnet3/nnet-utils.cc | 12 -- 6 files changed, 379 insertions(+), 29 deletions(-) diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index 0c6c690684a..64b1f4df9aa 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -395,12 +395,17 @@ struct NnetComputation { // These are owned here. std::vector component_precomputed_indexes; - // used in kAddRows, kAddToRows, kCopyRows, kCopyToRows. contains row-indexes. + // Used in commands kAddRows, kAddToRows, kCopyRows, which + // contain indexes into this data-member. + // Each vector is a vector of row-indexes (with -1 usually treated as + // a special case meaning "don't do anything for this row" for add + // commands, or "use zero" for copy commands). std::vector > indexes; - // used kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti. - // contains pairs (sub-matrix index, row index)- or (-1,-1) meaning don't - // do anything for this row. + // Used in commands kAddRowsMulti, kAddToRowsMulti, kCopyRowsMulti and + // kCopyToRowsMulti. Contains pairs (sub-matrix index, row index)- or the + // special pair (-1,-1) meaning "don't do anything for this row" for add + // commands, or "use zero" for copy commands. 
std::vector > > indexes_multi; diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index c53fba815fb..756ea45e894 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -2576,6 +2576,328 @@ bool SnipRowOps(NnetComputation *computation) { +// This class implements the internals of the function SplitRowOps() which is +// declared in nnet-optimize-utils.h. +class RowOpsSplitter { + public: + RowOpsSplitter(NnetComputation *computation): computation_(computation) { } + + // Attempts to perform the optimization. Returns true if it made any change + // to the computation. + bool Split() { + return SplitIndexes() && SplitCommands(); + } + + private: + + // This function sets up split_info_, which describes how we can split up + // the vectors that are elements of computation_->indexes_multi. + // It will return true if it successfully split at least one of those + // vectors, and false otherwise. + bool SplitIndexes(); + + // This function modifies the commands in the computation. It returns + // true if it made any change. + bool SplitCommands(); + + + // This function attempts to optimize the command in + // computation_->commands[command_index]. It returns true if it made any + // change. If we are going to have to insert an extra command into the + // computation, this function will append an element to new_commands_. + bool SplitCommand(int32 command_index); + + // Below, define a multi-index as an element of NnetComputation::indexes_multi, + // for example, + // const std::vector > &multi_index = computation_->indexes_multi[1]; + // It is a list of pairs. + + // This struct appears as an element of the list inside MultiIndexSplitInfo. + // It helps us describe how we can split up a multi-index (a list of pairs) + // into a sequence of ranges where the .first value is constant across the + // range. 
+ struct SingleSplitInfo { + // 'offset' is the index into the vector of pairs that forms the + // start of this range. In the example where we are splitting up + // ((10,2), (10,3), (10,4), (15,3), (15,5), (15,7)) + // there would be two instances of struct SingleSplitInfo, with + // offset = 0 and offset = 3. + int32 offset; + // 'size' is the number of pairs in this range; in the example + // above, both 'size' elements would be 3. + int32 size; + // first_value is the value of the .first index throughout this range; in + // the example above, it would be 10 and 15 respectively. It represents a + // submatrix index. + int32 first_value; + + // min_second_value is the minimum value of .second for any element in + // this range: it would be 2 and 3 respectively in the example above. + int32 min_second_value; + + // second_value_range is the highest value of .second for any element in + // this range, plus one, minus min_second_value. (It's the number of rows + // in the other submatrix of the operation). + int32 second_value_range; + + // If the .second values in the range are consecutive then + // 'second_value_offsets' will be empty. Otherwise it will + // be a vector of size 'size', containing numbers in the + // range 0 ... second_value_range - 1, such that + // min_second_value + second_value_offsets[i] gives + // the .second value at the corresponding position in the range. + // In the second range of the example above, the range + // consisting of ((15,3), (15,5), (15,7)), 'second_value_offsets' + // would be the vector (0, 2, 4). + std::vector second_value_offsets; + }; + + // An instance of the struct MultiIndexSplitInfo will be created for each multi-index, + // i.e. for each element of NnetComputation::indexes_multi. + struct MultiIndexSplitInfo { + // If we can split this multi-index into at most two ranges, this + // vector will be nonempty; otherwise it will be empty. 
+ std::vector splits; + }; + + // GetSplitInfo() attempts to take a range of a + // std::vector<std::pair<int32,int32> >, as represented by begin and end + // iterators, and to extract its information into an object of type + // SingleSplitInfo. (all except for the .offset member, which will have + // been set by calling code). + // It returns true if successful, and false otherwise. It returns false if the + // range contains -1's, does not contain all-identical .first members, or has + // .second values spanning a range that is too large relative to its size. + bool GetSplitInfo(std::vector >::const_iterator begin, + std::vector >::const_iterator end, + SingleSplitInfo *info); + + // computation_ is the computation that we are modifying. + NnetComputation *computation_; + // split_info_ will contain information about how we can split up the members + // of computation_->indexes_multi into ranges. + std::vector split_info_; + // The following is a list of additional commands that we are going to insert + // into computation_, of the form (command-index, command) where command-index + // is a command index just before which we will insert the new command. + // (this is the format accepted by the function InsertCommands()). + std::vector > new_commands_; + +}; + + +bool RowOpsSplitter::GetSplitInfo( + std::vector >::const_iterator begin, + std::vector >::const_iterator end, + SingleSplitInfo *info) { + // max_size_ratio must be > 1.0, and could in principle be a float. It is + // there to prevent us from making changes to the computation which would end + // up wastefully launching too many kernels that would do nothing. 
+ const int32 max_size_ratio = 2; + + int32 size = end - begin; + KALDI_ASSERT(size != 0); + int32 first = begin->first; + if (first < 0) + return false; + info->size = size; + info->first_value = first; + int32 initial_second_value = begin->second, + min_second_value = initial_second_value, + max_second_value = initial_second_value; + info->second_value_offsets.resize(size); + bool is_consecutive = true; + for (int32 i = 0; i < size; i++) { + int32 second = begin[i].second; + if (begin[i].first != first || second < 0) return false; + info->second_value_offsets[i] = second; + if (second != initial_second_value + i) + is_consecutive = false; + if (second < min_second_value) min_second_value = second; + if (second > max_second_value) max_second_value = second; + } + info->min_second_value = min_second_value; + info->second_value_range = max_second_value + 1 - min_second_value; + if (info->second_value_range > size * max_size_ratio) + return false; + if (is_consecutive) { + info->second_value_offsets.clear(); + } else { + for (int32 i = 0; i < size; i++) + info->second_value_offsets[i] -= min_second_value; + } + return true; +} + + +bool RowOpsSplitter::SplitIndexes() { + bool ans = false; + int32 num_indexes_multi = computation_->indexes_multi.size(); + split_info_.resize(num_indexes_multi); + for (int32 i = 0; i < num_indexes_multi; i++) { + const std::vector > &multi_index = + computation_->indexes_multi[i]; + MultiIndexSplitInfo &split_info = split_info_[i]; + + int32 num_pairs = multi_index.size(); + KALDI_ASSERT(num_pairs > 0); + // 'split_point' will be set to the first index j for which + // multi_index[j-1].first != multi_index[j].first, or -1 + // if no such j exists. 
+ int32 split_point = -1, initial_first = multi_index[0].first; + for (int32 j = 1; j < num_pairs; j++) { + if (multi_index[j].first != initial_first) { + split_point = j; + break; + } + } + if (split_point == -1) { + split_info.splits.resize(1); + split_info.splits[0].offset = 0; + if (!GetSplitInfo(multi_index.begin(), multi_index.end(), + &(split_info.splits[0]))) { + split_info.splits.clear(); + } else { + ans = true; + } + } else { + split_info.splits.resize(2); + split_info.splits[0].offset = 0; + split_info.splits[1].offset = split_point; + + std::vector >::const_iterator mid_iter = + multi_index.begin() + split_point; + if (!GetSplitInfo(multi_index.begin(), mid_iter, + &(split_info.splits[0])) || + !GetSplitInfo(mid_iter, multi_index.end(), + &(split_info.splits[1]))) { + split_info.splits.clear(); + } else { + ans = true; + } + } + } + return ans; +} + +bool RowOpsSplitter::SplitCommand(int32 c) { + NnetComputation::Command &command = computation_->commands[c]; + CommandType command_type = command.command_type; + // For commands that are not of the following four types, return false: we + // won't be changing these commands. + switch (command_type) { + case kAddRowsMulti: case kCopyRowsMulti: + case kAddToRowsMulti: case kCopyToRowsMulti: break; + default: return false; + } + int32 indexes_multi_index = command.arg2; + KALDI_ASSERT(indexes_multi_index < + static_cast(split_info_.size())); + const MultiIndexSplitInfo &split_info = split_info_[indexes_multi_index]; + if (split_info.splits.empty()) + return false; // these indexes couldn't be split: e.g. they contained more + // than two distinct .first elements, or there were other + // reasons. + + // we'll be splitting the command into either one or two pieces. 
+ std::vector split_commands( + split_info.splits.size()); + for (size_t i = 0; i < split_info.splits.size(); i++) { + const SingleSplitInfo &split = split_info.splits[i]; + NnetComputation::Command &command_out = split_commands[i]; + command_out.alpha = command.alpha; + command_out.arg1 = computation_->NewSubMatrix( + command.arg1, split.offset, split.size, 0, -1); + command_out.arg2 = computation_->NewSubMatrix( + split.first_value, split.min_second_value, + split.second_value_range, 0, -1); + + if (split.second_value_offsets.empty()) { + // The .second elements are consecutive. + switch (command_type) { + case kAddRowsMulti: + command_out.command_type = kMatrixAdd; + break; + case kCopyRowsMulti: + command_out.command_type = kMatrixCopy; + break; + case kAddToRowsMulti: + command_out.command_type = kMatrixAdd; + std::swap(command_out.arg1, command_out.arg2); + break; + case kCopyToRowsMulti: + command_out.command_type = kMatrixCopy; + std::swap(command_out.arg1, command_out.arg2); + break; + default: // will never be reached. + break; + } + } else { + // Indexes are not consecutive: it needs to be a kAddRows or kCopyRows + // command. + command_out.arg3 = computation_->indexes.size(); + switch (command_type) { + case kAddRowsMulti: case kCopyRowsMulti: { + command_out.command_type = (command_type == kAddRowsMulti ? + kAddRows : kCopyRows); + computation_->indexes.push_back(split.second_value_offsets); + break; + } + case kCopyToRowsMulti: { + // We can't operate on this command because of what would happen + // with values of 'indexes' (see the variable in the block for + // kAddToRowsMulti) which were -1. Rows of the output would be + // set to zero, which is not the behavior we want here; we'd want + // them to be unaffected. + return false; + } + case kAddToRowsMulti: { + command_out.command_type = kAddRows; + std::swap(command_out.arg1, command_out.arg2); + // invert the indexes. 
+ std::vector indexes(split.second_value_range, -1); + for (int32 i = 0; i < split.size; i++) { + // 'indexes' starts out as all -1, so each destination slot must still + // be unset here: AddToRowsMulti and CopyToRowsMulti should never have + // duplicate destinations in their indexes. + KALDI_ASSERT(indexes[split.second_value_offsets[i]] < 0); + indexes[split.second_value_offsets[i]] = i; + } + computation_->indexes.push_back(indexes); + break; + } + default: + KALDI_ERR << "Code error: un-handled case."; + } + } + } + command = split_commands[0]; + // note: for now, split_commands.size() will be 1 or 2. + for (size_t i = 1; i < split_commands.size(); i++) { + new_commands_.resize(new_commands_.size() + 1); + // we'll want to insert this command right after command c, + // which is the same as just before command c + 1. + new_commands_.back().first = c + 1; + new_commands_.back().second = split_commands[i]; + } + return true; // We made a change. +} + +bool RowOpsSplitter::SplitCommands() { + bool ans = false; + int32 num_commands = computation_->commands.size(); + for (int32 c = 0; c < num_commands; c++) + if (SplitCommand(c)) + ans = true; + if (!new_commands_.empty()) + InsertCommands(&new_commands_, computation_); + return ans; +} + +bool SplitRowOps(NnetComputation *computation) { + RowOpsSplitter splitter(computation); + return splitter.Split(); +} /* diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 703f43af095..32adf9e3e19 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -455,6 +455,23 @@ bool ReplaceRowWithMatrixOps(NnetComputation *computation); /// computation->indexes. 
bool SnipRowOps(NnetComputation *computation); + +/// This function detects cases where commands of type kAddRowsMulti, +/// kAddToRowsMulti, kCopyRowsMulti, kCopyToRowsMulti use indexes that +/// correspond to at most two submatrices, in two distinct ranges without gaps +/// filled by -1's, and could be converted to at most two commands of type +/// kMatrixAdd, kMatrixCopy, kAddRows or kCopyRows. (Note: it's important that +/// this optimization takes place after SnipRowOps, because it doesn't remove +/// the -1's from the edges of the indexes, it relies on that operation doing +/// so). The "without-gaps" stipulation is just for convenience of +/// implementation, to have fewer cases to worry about. +/// +/// This function returns true if it made any changes to the computation; if it +/// returns true, then after calling this you should at some point do +/// RenumberComputation(), which will remove any now-unused members of +/// computation->indexes. +bool SplitRowOps(NnetComputation *computation); + /// This function detects submatrices and matrices that are never used (e.g. due /// to changes made in other optimization code), and members of indexes, /// indexes_multi and indexes_ranges that are unused or are duplicates, and memo @@ -535,18 +552,18 @@ void IdentifyIndexesRangesArgs(std::vector *commands, std::vector *indexes_ranges_args); /// Inserts commands into the computation at the requested places. 'commands' -/// is a list of pairs (command-index, command) that is expected to be sorted -/// on command-index. For each entry (c, command) in 'commands', 'command' is -/// inserted into 'computation' just *before* the command that (at entry) is in -/// computation->commands[c]. If there are multiple pairs with the same index -/// c, they will remain in the same order in which they were present in -/// 'commands'; however, 'commands' does not have to be sorted on 'c'. 
-/// As a special case, if c == computation->commands.size(), the -/// corresponding commands are inserted at the beginning of the computation. -/// This function will appropriately renumber the argument of the kGotoLabel -/// command of any 'looped' computation. Command indexes c in commands[*].first -/// must be in the range [0, computation->commands.size()]. -/// This function may modify 'commands' by sorting it. +/// is a list of pairs (command-index, command) that is expected to be sorted on +/// command-index. For each entry (c, command) in 'commands', 'command' is +/// inserted into 'computation' just *before* the command that (at entry) is in +/// computation->commands[c]. If there are multiple pairs with the same index +/// c, they will remain in the same order in which they were present in +/// 'commands'; however, 'commands' does not have to be sorted on 'c'. As a +/// special case, if c == computation->commands.size(), the corresponding +/// commands are inserted at the beginning of the computation. This function +/// will appropriately renumber the argument of the kGotoLabel command of any +/// 'looped' computation. Command indexes c in commands[*].first must be in the +/// range [0, computation->commands.size()]. This function may modify +/// 'commands' by sorting it. 
void InsertCommands( std::vector > *commands, NnetComputation *computation); diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index d614afce7d0..ecce196801b 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -41,6 +41,14 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { if (tok == "") { ReadBasicType(is, binary, &optimize_row_ops); ReadToken(is, binary, &tok); + } else { + optimize_row_ops = true; + } + if (tok == "") { + ReadBasicType(is, binary, &split_row_ops); + ReadToken(is, binary, &tok); + } else { + split_row_ops = true; } KALDI_ASSERT(tok == ""); ReadBasicType(is, binary, &convert_addition); @@ -516,12 +524,16 @@ void Optimize(const NnetOptimizeOptions &config, } - if (config.optimize && (config.snip_row_ops || config.optimize_row_ops)) { + if (config.optimize && (config.snip_row_ops || config.optimize_row_ops || + config.split_row_ops)) { bool must_renumber = false; if (config.snip_row_ops && SnipRowOps(computation)) must_renumber = true; + if (config.split_row_ops && SplitRowOps(computation)) + must_renumber = true; if (config.optimize_row_ops && ReplaceRowWithMatrixOps(computation)) must_renumber = true; + if (must_renumber) { RenumberComputation(computation); if (GetVerboseLevel() >= 3) diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index 31872e46b72..a07c5490c5c 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -39,6 +39,7 @@ struct NnetOptimizeOptions { bool propagate_in_place; bool backprop_in_place; bool optimize_row_ops; + bool split_row_ops; bool extend_matrices; bool convert_addition; bool remove_assignments; @@ -63,6 +64,7 @@ struct NnetOptimizeOptions { propagate_in_place(true), backprop_in_place(true), optimize_row_ops(true), + split_row_ops(true), extend_matrices(true), convert_addition(true), remove_assignments(true), @@ -95,6 +97,10 @@ struct NnetOptimizeOptions { opts->Register("optimize-row-ops", &optimize_row_ops, "Set to false to " 
"disable certain optimizations that act on operations of " "type *Row*."); + opts->Register("split-row-ops", &split_row_ops, "Set to false to disable " + "an optimization that may replace some operations of type " + "kCopyRowsMulti or kAddRowsMulti with up to two simpler " + "operations."); opts->Register("convert-addition", &convert_addition, "Set to false to " "disable the optimization that converts Add commands into " "Copy commands wherever possible."); diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index 59885cf70b2..b0c81aae2b8 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -1805,19 +1805,7 @@ class ModelCollapser { void CollapseModel(const CollapseModelConfig &config, Nnet *nnet) { ModelCollapser c(config, nnet); - std::string info_before_collapse; - if (GetVerboseLevel() >= 4) - info_before_collapse = nnet->Info(); c.Collapse(); - if (GetVerboseLevel() >= 4) { - std::string info_after_collapse = nnet->Info(); - if (info_after_collapse != info_before_collapse) { - KALDI_VLOG(4) << "Collapsing model: info before collapse was: " - << info_before_collapse - << ", info after collapse was:" - << info_after_collapse; - } - } } bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, From 458dee84826fa1f6a7c3c352ba5a40d850466908 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 11 Feb 2018 02:15:11 -0500 Subject: [PATCH 02/10] [scripts] Add dropout-per-dim option --- .../steps/libs/nnet3/xconfig/basic_layers.py | 33 ++++++++++++++++--- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 47a5dfdf082..a3dfa89cf0e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -686,7 +686,9 @@ def set_default_configs(self): 'ng-linear-options': '', # only affects bottleneck layers. 
'dropout-proportion': 0.5, # dropout-proportion only # affects layers with - # 'dropout' in the name. + # 'dropout' in the name + 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout + # mask is shared across time. 'add-log-stddev': False, # the following are not really inspected by this level of # code, just passed through (but not if left at ''). @@ -862,10 +864,31 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim)) elif nonlinearity == 'dropout': - line = ('component name={0}.{1} type=DropoutComponent ' - 'dim={2} dropout-proportion={3}'.format( - self.name, nonlinearity, output_dim, - self.config['dropout-proportion'])) + if not self.config['dropout-per-dim']: + line = ('component name={0}.{1} type=DropoutComponent ' + 'dim={2} dropout-proportion={3}'.format( + self.name, nonlinearity, output_dim, + self.config['dropout-proportion'])) + else: + line = ('component name={0}.dropout_mask type=DropoutMaskComponent ' + 'output-dim={1} dropout-proportion={2}'.format( + self.name, output_dim, self.config['dropout-proportion'])) + configs.append(line) + # note: the input to the dropout_mask component is never used, it's + # just syntactically required. 
+ line = ('component-node name={0}.dropout_mask component={0}.dropout_mask ' + 'input={1}'.format(self.name, cur_node)) + configs.append(line) + line = ('component name={0}.dropout type=ElementwiseProductComponent ' + 'input-dim={1} output-dim={2} '.format( + self.name, 2 * output_dim, output_dim)) + configs.append(line) + line = ('component-node name={0}.dropout component={0}.dropout ' + 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' + ''.format(self.name, cur_node)) + configs.append(line) + cur_node = '{0}.dropout'.format(self.name) + continue else: raise RuntimeError("Unknown nonlinearity type: {0}" From 2181f4fc64ce5b6a3d5f42dd0172e95050491287 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 13 Feb 2018 22:22:55 -0500 Subject: [PATCH 03/10] [src,scripts] Add 'continuous' option for dropout. --- .../steps/libs/nnet3/xconfig/basic_layers.py | 9 ++- src/nnet3/nnet-general-component.cc | 75 +++++++++++++------ src/nnet3/nnet-general-component.h | 9 ++- src/nnet3/nnet-simple-component.h | 2 +- 4 files changed, 68 insertions(+), 27 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index a3dfa89cf0e..132fbdb6d82 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -689,6 +689,7 @@ def set_default_configs(self): # 'dropout' in the name 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout # mask is shared across time. + 'dropout-per-dim-continuous': False, 'add-log-stddev': False, # the following are not really inspected by this level of # code, just passed through (but not if left at ''). 
@@ -870,9 +871,11 @@ def _add_components(self, input_desc, input_dim, nonlinearities): self.name, nonlinearity, output_dim, self.config['dropout-proportion'])) else: + continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else '' line = ('component name={0}.dropout_mask type=DropoutMaskComponent ' - 'output-dim={1} dropout-proportion={2}'.format( - self.name, output_dim, self.config['dropout-proportion'])) + 'output-dim={1} dropout-proportion={2} {3}'.format( + self.name, output_dim, self.config['dropout-proportion'], + continuous_opt)) configs.append(line) # note: the input to the dropout_mask component is never used, it's # just syntactically required. @@ -886,6 +889,8 @@ def _add_components(self, input_desc, input_dim, nonlinearities): line = ('component-node name={0}.dropout component={0}.dropout ' 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' ''.format(self.name, cur_node)) + + configs.append(line) cur_node = '{0}.dropout'.format(self.name) continue diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index dd6e950a7d1..1ccb3d254fc 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1392,16 +1392,19 @@ std::string DropoutMaskComponent::Info() const { stream << Type() << ", output-dim=" << output_dim_ << ", dropout-proportion=" << dropout_proportion_; + if (continuous_) + stream << ", continuous=true"; return stream.str(); } DropoutMaskComponent::DropoutMaskComponent(): - output_dim_(-1), dropout_proportion_(0.5) { } + output_dim_(-1), dropout_proportion_(0.5), continuous_(false) { } DropoutMaskComponent::DropoutMaskComponent( const DropoutMaskComponent &other): output_dim_(other.output_dim_), - dropout_proportion_(other.dropout_proportion_) { } + dropout_proportion_(other.dropout_proportion_), + continuous_(other.continuous_) { } void* DropoutMaskComponent::Propagate( const ComponentPrecomputedIndexes *indexes, @@ -1415,29 +1418,47 @@ void* 
DropoutMaskComponent::Propagate( out->Set(1.0); return NULL; } + + if (continuous_) { + if (test_mode_) { + out->Set(1.0); + } else { + const_cast&>(random_generator_).RandUniform(out); + out->Scale(dropout_proportion * 4.0); + // make the expected value 1.0. + out->Add(1.0 - (2.0 * dropout_proportion)); + } + return NULL; + } + if (test_mode_) { out->Set(1.0 - dropout_proportion); return NULL; } + const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout_proportion); out->ApplyHeaviside(); - // To generate data where it's never the case that both of the dimensions - // for a row are zero, we generate uniformly distributed data (call this u_i), - // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) - // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) - int32 num_rows = out->NumRows(); - // later we may make this a bit more efficient. - CuVector temp(num_rows, kUndefined); - const_cast&>(random_generator_).RandUniform(&temp); - temp.Add(-dropout_proportion); - out->CopyColFromVec(temp, 0); - temp.Add(-1.0 + (2.0 * dropout_proportion)); - // Now, 'temp' contains the original uniformly-distributed data plus - // -(1 - dropout_proportion). - temp.Scale(-1.0); - out->CopyColFromVec(temp, 1); - out->ApplyHeaviside(); + + if (out->NumCols() == 2 || out->NumCols() == 3) { + // This is a kind of special case relevant to LSTms. + // To generate data where it's never the case that both of the dimensions + // for a row are zero, we generate uniformly distributed data (call this u_i), + // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) + // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) + int32 num_rows = out->NumRows(); + // later we may make this a bit more efficient. 
+ CuVector temp(num_rows, kUndefined); + const_cast&>(random_generator_).RandUniform(&temp); + temp.Add(-dropout_proportion); + out->CopyColFromVec(temp, 0); + temp.Add(-1.0 + (2.0 * dropout_proportion)); + // Now, 'temp' contains the original uniformly-distributed data plus + // -(1 - dropout_proportion). + temp.Scale(-1.0); + out->CopyColFromVec(temp, 1); + out->ApplyHeaviside(); + } return NULL; } @@ -1447,15 +1468,19 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); - std::string token; - ReadToken(is, binary, &token); - if (token == "") { + if (PeekToken(is, binary) == 'T') { + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &test_mode_); // read test mode - ExpectToken(is, binary, ""); } else { test_mode_ = false; - KALDI_ASSERT(token == ""); } + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + continuous_ = true; + } else { + continuous_ = false; + } + ExpectToken(is, binary, ""); } @@ -1467,6 +1492,8 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dropout_proportion_); WriteToken(os, binary, ""); WriteBasicType(os, binary, test_mode_); + if (continuous_) + WriteToken(os, binary, ""); WriteToken(os, binary, ""); } @@ -1480,6 +1507,8 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(ok && output_dim_ > 0); dropout_proportion_ = 0.5; cfl->GetValue("dropout-proportion", &dropout_proportion_); + continuous_ = false; + cfl->GetValue("continuous", &continuous_); test_mode_ = false; cfl->GetValue("test-mode", &test_mode_); } diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 36829329d66..86614a1847f 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -715,7 +715,12 @@ class DropoutMaskComponent: public RandomComponent { virtual std::string Info() 
const; // possible parameter values with their defaults: - // dropout-proportion=0.5 output-dim=-1 + // dropout-proportion=0.5 output-dim=-1 continuous=false + // With the 'continuous=false' option (the default), it generates + // 0 with probability 'dropout-proportion' and 1 otherwise. + // With 'continuous=true' it outputs 1 plus dropout-proportion times + // a value uniformly distributed on [-2, 2]. (e.g. if dropout-proportion is + // 0.5, this would amount to a value uniformly distributed on [0,2].) virtual void InitFromConfig(ConfigLine *cfl); DropoutMaskComponent(); @@ -771,6 +776,8 @@ class DropoutMaskComponent: public RandomComponent { BaseFloat dropout_proportion_; + bool continuous_; + const DropoutMaskComponent &operator = (const DropoutMaskComponent &other); // Disallow. }; diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index b1eb30a55bf..2d776180533 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -2176,7 +2176,7 @@ class LstmNonlinearityComponent: public UpdatableComponent { // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. CuMatrix params_; - // If true, we expect an extra 2 dimensions on the input, for dropout masks + // If true, we expect an extra 3 dimensions on the input, for dropout masks // for i_t and f_t. 
bool use_dropout_; From 99fcff919f625c79e346317c8cd0d70f3a60e25f Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 14 Feb 2018 23:35:59 -0500 Subject: [PATCH 04/10] [src,scripts] adding script-level and code support for continuous dropout --- .../steps/libs/nnet3/xconfig/basic_layers.py | 9 ++- src/nnet3/nnet-general-component.cc | 75 +++++++++++++------ src/nnet3/nnet-general-component.h | 9 ++- src/nnet3/nnet-simple-component.h | 2 +- 4 files changed, 68 insertions(+), 27 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index a3dfa89cf0e..132fbdb6d82 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -689,6 +689,7 @@ def set_default_configs(self): # 'dropout' in the name 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout # mask is shared across time. + 'dropout-per-dim-continuous': False, 'add-log-stddev': False, # the following are not really inspected by this level of # code, just passed through (but not if left at ''). @@ -870,9 +871,11 @@ def _add_components(self, input_desc, input_dim, nonlinearities): self.name, nonlinearity, output_dim, self.config['dropout-proportion'])) else: + continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else '' line = ('component name={0}.dropout_mask type=DropoutMaskComponent ' - 'output-dim={1} dropout-proportion={2}'.format( - self.name, output_dim, self.config['dropout-proportion'])) + 'output-dim={1} dropout-proportion={2} {3}'.format( + self.name, output_dim, self.config['dropout-proportion'], + continuous_opt)) configs.append(line) # note: the input to the dropout_mask component is never used, it's # just syntactically required. 
@@ -886,6 +889,8 @@ def _add_components(self, input_desc, input_dim, nonlinearities): line = ('component-node name={0}.dropout component={0}.dropout ' 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' ''.format(self.name, cur_node)) + + configs.append(line) cur_node = '{0}.dropout'.format(self.name) continue diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index dd6e950a7d1..1ccb3d254fc 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1392,16 +1392,19 @@ std::string DropoutMaskComponent::Info() const { stream << Type() << ", output-dim=" << output_dim_ << ", dropout-proportion=" << dropout_proportion_; + if (continuous_) + stream << ", continuous=true"; return stream.str(); } DropoutMaskComponent::DropoutMaskComponent(): - output_dim_(-1), dropout_proportion_(0.5) { } + output_dim_(-1), dropout_proportion_(0.5), continuous_(false) { } DropoutMaskComponent::DropoutMaskComponent( const DropoutMaskComponent &other): output_dim_(other.output_dim_), - dropout_proportion_(other.dropout_proportion_) { } + dropout_proportion_(other.dropout_proportion_), + continuous_(other.continuous_) { } void* DropoutMaskComponent::Propagate( const ComponentPrecomputedIndexes *indexes, @@ -1415,29 +1418,47 @@ void* DropoutMaskComponent::Propagate( out->Set(1.0); return NULL; } + + if (continuous_) { + if (test_mode_) { + out->Set(1.0); + } else { + const_cast&>(random_generator_).RandUniform(out); + out->Scale(dropout_proportion * 4.0); + // make the expected value 1.0. 
+ out->Add(1.0 - (2.0 * dropout_proportion)); + } + return NULL; + } + if (test_mode_) { out->Set(1.0 - dropout_proportion); return NULL; } + const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout_proportion); out->ApplyHeaviside(); - // To generate data where it's never the case that both of the dimensions - // for a row are zero, we generate uniformly distributed data (call this u_i), - // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) - // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) - int32 num_rows = out->NumRows(); - // later we may make this a bit more efficient. - CuVector temp(num_rows, kUndefined); - const_cast&>(random_generator_).RandUniform(&temp); - temp.Add(-dropout_proportion); - out->CopyColFromVec(temp, 0); - temp.Add(-1.0 + (2.0 * dropout_proportion)); - // Now, 'temp' contains the original uniformly-distributed data plus - // -(1 - dropout_proportion). - temp.Scale(-1.0); - out->CopyColFromVec(temp, 1); - out->ApplyHeaviside(); + + if (out->NumCols() == 2 || out->NumCols() == 3) { + // This is a kind of special case relevant to LSTms. + // To generate data where it's never the case that both of the dimensions + // for a row are zero, we generate uniformly distributed data (call this u_i), + // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) + // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) + int32 num_rows = out->NumRows(); + // later we may make this a bit more efficient. + CuVector temp(num_rows, kUndefined); + const_cast&>(random_generator_).RandUniform(&temp); + temp.Add(-dropout_proportion); + out->CopyColFromVec(temp, 0); + temp.Add(-1.0 + (2.0 * dropout_proportion)); + // Now, 'temp' contains the original uniformly-distributed data plus + // -(1 - dropout_proportion). 
+ temp.Scale(-1.0); + out->CopyColFromVec(temp, 1); + out->ApplyHeaviside(); + } return NULL; } @@ -1447,15 +1468,19 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); - std::string token; - ReadToken(is, binary, &token); - if (token == "") { + if (PeekToken(is, binary) == 'T') { + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &test_mode_); // read test mode - ExpectToken(is, binary, ""); } else { test_mode_ = false; - KALDI_ASSERT(token == ""); } + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + continuous_ = true; + } else { + continuous_ = false; + } + ExpectToken(is, binary, ""); } @@ -1467,6 +1492,8 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dropout_proportion_); WriteToken(os, binary, ""); WriteBasicType(os, binary, test_mode_); + if (continuous_) + WriteToken(os, binary, ""); WriteToken(os, binary, ""); } @@ -1480,6 +1507,8 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(ok && output_dim_ > 0); dropout_proportion_ = 0.5; cfl->GetValue("dropout-proportion", &dropout_proportion_); + continuous_ = false; + cfl->GetValue("continuous", &continuous_); test_mode_ = false; cfl->GetValue("test-mode", &test_mode_); } diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 36829329d66..86614a1847f 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -715,7 +715,12 @@ class DropoutMaskComponent: public RandomComponent { virtual std::string Info() const; // possible parameter values with their defaults: - // dropout-proportion=0.5 output-dim=-1 + // dropout-proportion=0.5 output-dim=-1 continuous=false + // With the 'continuous=false' option (the default), it generates + // 0 with probability 'dropout-proportion' and 1 otherwise. 
+ // With 'continuous=true' it outputs 1 plus dropout-proportion times + // a value uniformly distributed on [-2, 2]. (e.g. if dropout-proportion is + // 0.5, this would amount to a value uniformly distributed on [0,2].) virtual void InitFromConfig(ConfigLine *cfl); DropoutMaskComponent(); @@ -771,6 +776,8 @@ class DropoutMaskComponent: public RandomComponent { BaseFloat dropout_proportion_; + bool continuous_; + const DropoutMaskComponent &operator = (const DropoutMaskComponent &other); // Disallow. }; diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index b1eb30a55bf..2d776180533 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -2176,7 +2176,7 @@ class LstmNonlinearityComponent: public UpdatableComponent { // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. CuMatrix params_; - // If true, we expect an extra 2 dimensions on the input, for dropout masks + // If true, we expect an extra 3 dimensions on the input, for dropout masks // for i_t and f_t. 
bool use_dropout_; From 9bddde88964e8ab5830995b4c346476cd90f4c15 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 16 Feb 2018 22:09:45 -0500 Subject: [PATCH 05/10] [src] Small fix to cu-kernels.cu RE compression kernel --- src/cudamatrix/cu-kernels.cu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index b0468b7fa7c..ae7e25b716d 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3601,7 +3601,7 @@ static void _cuda_compress_bounds_check(const BaseFloat *src, MatrixDim dim, src_index = i + j * dim.stride; const int min_value = minimum_integer_value(), max_value = maximum_integer_value(); - int16_t compressed_value; + int compressed_value; int ok = (i < dim.cols && j < dim.rows); if (ok) { float f = src[src_index]; @@ -3611,7 +3611,6 @@ static void _cuda_compress_bounds_check(const BaseFloat *src, MatrixDim dim, // range of 'int', and if it fails, we've probably already catastrophically // diverged. int i = __float2int_rn(f * inv_scale); - // note: SignedInt will be int8 or (more likely) int16. if (i < min_value) compressed_value = min_value; else if (i > max_value) compressed_value = max_value; else compressed_value = i; From 82d2e7505ed5dc163f8205830fb4a9db88f13456 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 13 Feb 2018 22:22:55 -0500 Subject: [PATCH 06/10] [src,scripts] Add 'continuous' option for dropout. 
--- .../steps/libs/nnet3/xconfig/basic_layers.py | 9 ++- src/nnet3/nnet-general-component.cc | 75 +++++++++++++------ src/nnet3/nnet-general-component.h | 9 ++- src/nnet3/nnet-simple-component.h | 2 +- 4 files changed, 68 insertions(+), 27 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index a3dfa89cf0e..132fbdb6d82 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -689,6 +689,7 @@ def set_default_configs(self): # 'dropout' in the name 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout # mask is shared across time. + 'dropout-per-dim-continuous': False, 'add-log-stddev': False, # the following are not really inspected by this level of # code, just passed through (but not if left at ''). @@ -870,9 +871,11 @@ def _add_components(self, input_desc, input_dim, nonlinearities): self.name, nonlinearity, output_dim, self.config['dropout-proportion'])) else: + continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else '' line = ('component name={0}.dropout_mask type=DropoutMaskComponent ' - 'output-dim={1} dropout-proportion={2}'.format( - self.name, output_dim, self.config['dropout-proportion'])) + 'output-dim={1} dropout-proportion={2} {3}'.format( + self.name, output_dim, self.config['dropout-proportion'], + continuous_opt)) configs.append(line) # note: the input to the dropout_mask component is never used, it's # just syntactically required. 
@@ -886,6 +889,8 @@ def _add_components(self, input_desc, input_dim, nonlinearities): line = ('component-node name={0}.dropout component={0}.dropout ' 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' ''.format(self.name, cur_node)) + + configs.append(line) cur_node = '{0}.dropout'.format(self.name) continue diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index dd6e950a7d1..1ccb3d254fc 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1392,16 +1392,19 @@ std::string DropoutMaskComponent::Info() const { stream << Type() << ", output-dim=" << output_dim_ << ", dropout-proportion=" << dropout_proportion_; + if (continuous_) + stream << ", continuous=true"; return stream.str(); } DropoutMaskComponent::DropoutMaskComponent(): - output_dim_(-1), dropout_proportion_(0.5) { } + output_dim_(-1), dropout_proportion_(0.5), continuous_(false) { } DropoutMaskComponent::DropoutMaskComponent( const DropoutMaskComponent &other): output_dim_(other.output_dim_), - dropout_proportion_(other.dropout_proportion_) { } + dropout_proportion_(other.dropout_proportion_), + continuous_(other.continuous_) { } void* DropoutMaskComponent::Propagate( const ComponentPrecomputedIndexes *indexes, @@ -1415,29 +1418,47 @@ void* DropoutMaskComponent::Propagate( out->Set(1.0); return NULL; } + + if (continuous_) { + if (test_mode_) { + out->Set(1.0); + } else { + const_cast&>(random_generator_).RandUniform(out); + out->Scale(dropout_proportion * 4.0); + // make the expected value 1.0. 
+ out->Add(1.0 - (2.0 * dropout_proportion)); + } + return NULL; + } + if (test_mode_) { out->Set(1.0 - dropout_proportion); return NULL; } + const_cast&>(random_generator_).RandUniform(out); out->Add(-dropout_proportion); out->ApplyHeaviside(); - // To generate data where it's never the case that both of the dimensions - // for a row are zero, we generate uniformly distributed data (call this u_i), - // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) - // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) - int32 num_rows = out->NumRows(); - // later we may make this a bit more efficient. - CuVector temp(num_rows, kUndefined); - const_cast&>(random_generator_).RandUniform(&temp); - temp.Add(-dropout_proportion); - out->CopyColFromVec(temp, 0); - temp.Add(-1.0 + (2.0 * dropout_proportion)); - // Now, 'temp' contains the original uniformly-distributed data plus - // -(1 - dropout_proportion). - temp.Scale(-1.0); - out->CopyColFromVec(temp, 1); - out->ApplyHeaviside(); + + if (out->NumCols() == 2 || out->NumCols() == 3) { + // This is a kind of special case relevant to LSTms. + // To generate data where it's never the case that both of the dimensions + // for a row are zero, we generate uniformly distributed data (call this u_i), + // and for row i, set (*out)(i, 0) = (0 if u_i < dropout_proportion else 1) + // and (*out)(i, 1) = (0 if u_i > 1-dropout_proportion else 1) + int32 num_rows = out->NumRows(); + // later we may make this a bit more efficient. + CuVector temp(num_rows, kUndefined); + const_cast&>(random_generator_).RandUniform(&temp); + temp.Add(-dropout_proportion); + out->CopyColFromVec(temp, 0); + temp.Add(-1.0 + (2.0 * dropout_proportion)); + // Now, 'temp' contains the original uniformly-distributed data plus + // -(1 - dropout_proportion). 
+ temp.Scale(-1.0); + out->CopyColFromVec(temp, 1); + out->ApplyHeaviside(); + } return NULL; } @@ -1447,15 +1468,19 @@ void DropoutMaskComponent::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &output_dim_); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &dropout_proportion_); - std::string token; - ReadToken(is, binary, &token); - if (token == "") { + if (PeekToken(is, binary) == 'T') { + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &test_mode_); // read test mode - ExpectToken(is, binary, ""); } else { test_mode_ = false; - KALDI_ASSERT(token == ""); } + if (PeekToken(is, binary) == 'C') { + ExpectToken(is, binary, ""); + continuous_ = true; + } else { + continuous_ = false; + } + ExpectToken(is, binary, ""); } @@ -1467,6 +1492,8 @@ void DropoutMaskComponent::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, dropout_proportion_); WriteToken(os, binary, ""); WriteBasicType(os, binary, test_mode_); + if (continuous_) + WriteToken(os, binary, ""); WriteToken(os, binary, ""); } @@ -1480,6 +1507,8 @@ void DropoutMaskComponent::InitFromConfig(ConfigLine *cfl) { KALDI_ASSERT(ok && output_dim_ > 0); dropout_proportion_ = 0.5; cfl->GetValue("dropout-proportion", &dropout_proportion_); + continuous_ = false; + cfl->GetValue("continuous", &continuous_); test_mode_ = false; cfl->GetValue("test-mode", &test_mode_); } diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index 36829329d66..86614a1847f 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -715,7 +715,12 @@ class DropoutMaskComponent: public RandomComponent { virtual std::string Info() const; // possible parameter values with their defaults: - // dropout-proportion=0.5 output-dim=-1 + // dropout-proportion=0.5 output-dim=-1 continuous=false + // With the 'continuous=false' option (the default), it generates + // 0 with probability 'dropout-proportion' and 1 otherwise. 
+ // With 'continuous=true' it outputs 1 plus dropout-proportion times + // a value uniformly distributed on [-2, 2]. (e.g. if dropout-proportion is + // 0.5, this would amount to a value uniformly distributed on [0,2].) virtual void InitFromConfig(ConfigLine *cfl); DropoutMaskComponent(); @@ -771,6 +776,8 @@ class DropoutMaskComponent: public RandomComponent { BaseFloat dropout_proportion_; + bool continuous_; + const DropoutMaskComponent &operator = (const DropoutMaskComponent &other); // Disallow. }; diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index b1eb30a55bf..2d776180533 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -2176,7 +2176,7 @@ class LstmNonlinearityComponent: public UpdatableComponent { // it contains the 3 diagonal parameter matrices w_i, w_f and w_o. CuMatrix params_; - // If true, we expect an extra 2 dimensions on the input, for dropout masks + // If true, we expect an extra 3 dimensions on the input, for dropout masks // for i_t and f_t. bool use_dropout_; From 83b97da1d6361651c9d19dfe8070feb596766c7b Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 18 Feb 2018 17:31:08 -0500 Subject: [PATCH 07/10] [src] Prevent NaNs from appearing in chain forward-backward when nnet output is a bit out of range. 
--- .../s5c/local/chain/tuning/run_tdnn_7l.sh | 2 +- .../steps/libs/nnet3/xconfig/basic_layers.py | 20 ++++++++---- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 + src/chain/chain-denominator.cc | 5 ++- src/cudamatrix/cu-kernels-ansi.h | 4 +++ src/cudamatrix/cu-kernels.cu | 32 +++++++++++++++++++ src/cudamatrix/cu-kernels.h | 8 +++++ src/cudamatrix/cu-matrix-test.cc | 25 +++++++++++++++ src/cudamatrix/cu-matrix.cc | 31 ++++++++++++++++++ src/cudamatrix/cu-matrix.h | 7 ++++ src/nnet3/nnet-general-component.cc | 2 +- 11 files changed, 128 insertions(+), 9 deletions(-) diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh index f7681a743e1..43073895382 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh @@ -10,7 +10,7 @@ #Final valid prob -0.113 -0.116 #Final train prob (xent) -1.25 -1.38 #Final valid prob (xent) -1.36 -1.48 -#Time consuming one iter 53.56s 48.18s +#Time consuming one iter 53.56s 48.18s #Time reduction percent 10.1% set -e diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index 132fbdb6d82..2cacbf43d16 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -688,7 +688,9 @@ def set_default_configs(self): # affects layers with # 'dropout' in the name 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout - # mask is shared across time. + # mask is shared across time. Or you can set dropout-period + 'dropout-period': 0, # if set to a nonzero value (e.g. 10), we'll share the + # dropout mask across chunks of time values. 
'dropout-per-dim-continuous': False, 'add-log-stddev': False, # the following are not really inspected by this level of @@ -865,7 +867,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities): ''.format(self.name, nonlinearity, output_dim)) elif nonlinearity == 'dropout': - if not self.config['dropout-per-dim']: + if not self.config['dropout-per-dim'] and self.config['dropout-period'] == 0: line = ('component name={0}.{1} type=DropoutComponent ' 'dim={2} dropout-proportion={3}'.format( self.name, nonlinearity, output_dim, @@ -886,10 +888,16 @@ def _add_components(self, input_desc, input_dim, nonlinearities): 'input-dim={1} output-dim={2} '.format( self.name, 2 * output_dim, output_dim)) configs.append(line) - line = ('component-node name={0}.dropout component={0}.dropout ' - 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' - ''.format(self.name, cur_node)) - + if self.config['dropout-per-dim']: + line = ('component-node name={0}.dropout component={0}.dropout ' + 'input=Append({1}, ReplaceIndex({0}.dropout_mask, t, 0))' + ''.format(self.name, cur_node)) + else: + dropout_period = self.config['dropout-period'] + assert dropout_period > 0 + line = ('component-node name={0}.dropout component={0}.dropout ' + 'input=Append({1}, Round({0}.dropout_mask, {2}))' + ''.format(self.name, cur_node, dropout_period)) configs.append(line) cur_node = '{0}.dropout'.format(self.name) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 6fbde1fbbcc..fe25d95df91 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -23,6 +23,7 @@ 'relu-layer' : xlayers.XconfigBasicLayer, 'relu-renorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, + 'relu-dropout-batchnorm-layer' : xlayers.XconfigBasicLayer, 'relu-dropout-layer': xlayers.XconfigBasicLayer, 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, 
'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, diff --git a/src/chain/chain-denominator.cc b/src/chain/chain-denominator.cc index 620ea873eb7..1e2eb42a690 100644 --- a/src/chain/chain-denominator.cc +++ b/src/chain/chain-denominator.cc @@ -57,7 +57,10 @@ DenominatorComputation::DenominatorComputation( num_sequences_).SetZero(); KALDI_ASSERT(nnet_output.NumRows() % num_sequences == 0); - exp_nnet_output_transposed_.ApplyExp(); + // We limit the nnet output to the range [-30,30] before doing the exp; + // this avoids NaNs appearing in the forward-backward computation, which + // is not done in log space. + exp_nnet_output_transposed_.ApplyExpLimited(-30.0, 30.0); } diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index 8ab03c7e14e..ac2f15aa2e2 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -194,6 +194,10 @@ void cudaF_apply_ceiling(dim3 Gr, dim3 Bl, float* mat, float ceiling_val, MatrixDim d); void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d); void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d); +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit); +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit); void cudaD_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride); void cudaF_apply_exp_special(dim3 Gr, dim3 Bl, float* out, MatrixDim out_dim, diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index ae7e25b716d..6ca8b107dad 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -400,6 +400,26 @@ static void _apply_exp(Real* mat, MatrixDim d) { } } +template +__global__ +static void _apply_exp_limited(Real* mat, MatrixDim d, + Real lower_limit, Real upper_limit) { + int32_cuda i = blockIdx.x * blockDim.x + threadIdx.x; + int32_cuda j = blockIdx.y * 
blockDim.y + threadIdx.y; + int32_cuda index = i + j * d.stride; + if (i < d.cols && j < d.rows) { + Real x = mat[index]; + // I'm writing !(x >= lower_limit) instead of (x < lower_limit) so that + // nan's will be set to the lower-limit. + if (!(x >= lower_limit)) + x = lower_limit; + else if (x > upper_limit) + x = upper_limit; + mat[index] = exp(x); + } +} + + template __global__ static void _scale_diag_packed(Real* mat, Real value, int dim) { @@ -3717,6 +3737,11 @@ void cudaF_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaF_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + void cudaF_apply_pow(dim3 Gr, dim3 Bl, float* mat, float power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } @@ -4407,6 +4432,13 @@ void cudaD_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { _apply_exp<<>>(mat,d); } +void cudaD_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + _apply_exp_limited<<>>(mat, d, lower_limit, upper_limit); +} + + + void cudaD_apply_pow(dim3 Gr, dim3 Bl, double* mat, double power, MatrixDim d) { _apply_pow<<>>(mat, power, d); } diff --git a/src/cudamatrix/cu-kernels.h b/src/cudamatrix/cu-kernels.h index 3518e0c71ed..871c959f5d4 100644 --- a/src/cudamatrix/cu-kernels.h +++ b/src/cudamatrix/cu-kernels.h @@ -335,6 +335,14 @@ inline void cuda_apply_exp(dim3 Gr, dim3 Bl, double* mat, MatrixDim d) { inline void cuda_apply_exp(dim3 Gr, dim3 Bl, float* mat, MatrixDim d) { cudaF_apply_exp(Gr, Bl, mat, d); } +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, double* mat, MatrixDim d, + double lower_limit, double upper_limit) { + cudaD_apply_exp_limited(Gr, Bl, mat, d, lower_limit, upper_limit); +} +inline void cuda_apply_exp_limited(dim3 Gr, dim3 Bl, float* mat, MatrixDim d, + float lower_limit, float upper_limit) { + cudaF_apply_exp_limited(Gr, 
Bl, mat, d, lower_limit, upper_limit); +} inline void cuda_apply_exp_special(dim3 Gr, dim3 Bl, double* out, MatrixDim out_dim, const double* in, int in_stride) { diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 909e5552a35..baee5f98e60 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -194,6 +194,30 @@ static void UnitTestCuMatrixApplyExp() { } +template +static void UnitTestCuMatrixApplyExpLimited() { + int32 M = 10 + Rand() % 20, N = 10 + Rand() % 20; + Matrix H(M, N); + H.SetRandn(); + + + BaseFloat lower_limit = -0.2, upper_limit = 0.2; + + CuMatrix D(H); + + D.ApplyExpLimited(lower_limit, upper_limit); + + + H.ApplyFloor(lower_limit); + H.ApplyCeiling(upper_limit); + H.ApplyExp(); + + Matrix H2(D); + + AssertEqual(H,H2); +} + + template static void UnitTestCuMatrixSigmoid() { @@ -2859,6 +2883,7 @@ static void UnitTestCuMatrixEqualElementMask() { template void CudaMatrixUnitTest() { UnitTestCuMatrixApplyExpSpecial(); + UnitTestCuMatrixApplyExpLimited(); UnitTextCuMatrixAddSmatMat(); UnitTextCuMatrixAddMatSmat(); UnitTextCuMatrixAddSmat(); diff --git a/src/cudamatrix/cu-matrix.cc b/src/cudamatrix/cu-matrix.cc index 813c5e75d14..a34804f534e 100644 --- a/src/cudamatrix/cu-matrix.cc +++ b/src/cudamatrix/cu-matrix.cc @@ -2498,6 +2498,37 @@ void CuMatrixBase::ApplyExp() { } } +template +void CuMatrixBase::ApplyExpLimited(Real lower_limit, Real upper_limit) { + KALDI_ASSERT(upper_limit > lower_limit); +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + CuTimer tim; + dim3 dimGrid, dimBlock; + GetBlockSizesForSimpleMatrixOperation(NumRows(), NumCols(), + &dimGrid, &dimBlock); + cuda_apply_exp_limited(dimGrid, dimBlock, data_, Dim(), lower_limit, upper_limit); + CU_SAFE_CALL(cudaGetLastError()); + CuDevice::Instantiate().AccuProfile(__func__, tim); + } else +#endif + { + int32 num_rows = num_rows_, num_cols = num_cols_; + for (int32 r = 0; r < num_rows; r++) { + Real *row_data = 
this->RowData(r); + for (int32 c = 0; c < num_cols; c++) { + Real x = row_data[c]; + if (!(x >= lower_limit)) + x = lower_limit; + if (x > upper_limit) + x = upper_limit; + row_data[c] = Exp(x); + } + } + } +} + + template void CuMatrixBase::ApplyExpSpecial() { #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-matrix.h b/src/cudamatrix/cu-matrix.h index 7c3a2a2e11f..0fa022e1569 100644 --- a/src/cudamatrix/cu-matrix.h +++ b/src/cudamatrix/cu-matrix.h @@ -390,6 +390,13 @@ class CuMatrixBase { void ApplyCeiling(Real ceiling_val); void ApplyExp(); + + /// This is equivalent to running: + /// ApplyFloor(lower_limit); + /// ApplyCeiling(upper_limit); + /// ApplyExp() + void ApplyExpLimited(Real lower_limit, Real upper_limit); + /// For each element x of the matrix, set it to /// (x < 0 ? exp(x) : x + 1). This function is used /// in our RNNLM training. diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 1ccb3d254fc..20706c11ad8 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1414,7 +1414,7 @@ void* DropoutMaskComponent::Propagate( BaseFloat dropout_proportion = dropout_proportion_; KALDI_ASSERT(dropout_proportion >= 0.0 && dropout_proportion <= 1.0); - if (dropout_proportion_ == 0) { + if (dropout_proportion == 0) { out->Set(1.0); return NULL; } From 2b24aaa2ba93a2330d5649f9ee25c01c94aac910 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 24 Mar 2018 23:47:22 -0400 Subject: [PATCH 08/10] [egs] revert some changes --- egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh | 2 +- egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh index 43073895382..f7681a743e1 100644 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7l.sh @@ -10,7 +10,7 @@ #Final valid prob -0.113 -0.116 #Final train 
prob (xent) -1.25 -1.38 #Final valid prob (xent) -1.36 -1.48 -#Time consuming one iter 53.56s 48.18s +#Time consuming one iter 53.56s 48.18s #Time reduction percent 10.1% set -e diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py index 0019f5700c3..99911b39fb2 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/parser.py @@ -23,7 +23,6 @@ 'relu-layer' : xlayers.XconfigBasicLayer, 'relu-renorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, - 'relu-dropout-batchnorm-layer' : xlayers.XconfigBasicLayer, 'relu-dropout-layer': xlayers.XconfigBasicLayer, 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, From bfdb7a7b100fd72fec4c6501ab6950db81b06295 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 24 Mar 2018 23:36:19 -0400 Subject: [PATCH 09/10] [egs] some script updates Conflicts: egs/swbd/s5c/local/chain/tuning/run_tdnn_7m26h2.sh egs/swbd/s5c/local/chain/tuning/run_tdnn_7m26l.sh --- egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh index 753dfc632ba..b927cc86823 100755 --- a/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh +++ b/egs/swbd/s5c/local/chain/tuning/run_tdnn_7o.sh @@ -18,7 +18,7 @@ # # # local/chain/compare_wer_general.sh --rt03 tdnn7n_sp tdnn7m26o_sp -# System tdnn7n_sp tdnn7m26j_sp +# System tdnn7n_sp tdnn7m26o_sp # WER on train_dev(tg) 12.18 11.74 # WER on train_dev(fg) 11.12 10.69 # WER on eval2000(tg) 14.9 14.6 From 78499c13befce4787edbcd2a255a9e58b6b5f4fd Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sat, 24 Mar 2018 23:30:57 -0400 Subject: [PATCH 10/10] [src] documentation improvements --- src/nnet3/nnet-simple-component.cc | 6 +---- src/nnet3/nnet-simple-component.h | 38 
++++++++++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 7 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index f9f286aaed2..4eb078c0fcb 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -3730,15 +3730,11 @@ void NaturalGradientPerElementScaleComponent::InitFromConfig(ConfigLine *cfl) { // for the preconditioner actually exceeds the memory for the // parameters (by "rank"). update_period = 10; - BaseFloat num_samples_history = 2000.0, alpha = 4.0, - max_change_per_minibatch = 0.0; + BaseFloat num_samples_history = 2000.0, alpha = 4.0; cfl->GetValue("rank", &rank); cfl->GetValue("update-period", &update_period); cfl->GetValue("num-samples-history", &num_samples_history); cfl->GetValue("alpha", &alpha); - cfl->GetValue("max-change-per-minibatch", &max_change_per_minibatch); - if (max_change_per_minibatch != 0.0) - KALDI_WARN << "max-change-per-minibatch is now ignored, use 'max-change'"; InitLearningRatesFromConfig(cfl); std::string filename; // Accepts "scales" config (for filename) or "dim" -> random init, for testing. diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 9d438678f5d..3929c253aab 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1446,6 +1446,19 @@ class PermuteComponent: public Component { trainable scale; it's like a linear component with a diagonal matrix. This version (and its child class NaturalGradientPerElementScaleComponent) requires the input for backprop. See also ScaleAndOffsetComponent. + + Accepted values on its config line, with defaults if applicable: + + vector If specified, the scales will be read from this file ('vector' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'vector' is not specified. 
+ + param-mean=1.0 Mean of randomly initialized scale parameters; should only + be supplied if 'vector' is not supplied. + param-stddev=0.0 Standard deviation of randomly initialized scale parameters; + should only be supplied if 'vector' is not supplied. */ class PerElementScaleComponent: public UpdatableComponent { public: @@ -1670,8 +1683,29 @@ class ConstantFunctionComponent: public UpdatableComponent { -// NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but -// it uses a natural gradient update for the per-element scales. +/** + NaturalGradientPerElementScaleComponent is like PerElementScaleComponent but + it uses a natural gradient update for the per-element scales. + + Accepted values on its config line, with defaults if applicable: + + scales If specified, the scales will be read from this file ('scales' + is interpreted as an rxfilename). + + dim The dimension that this component inputs and outputs. + Only required if 'scales' is not specified. + + param-mean=1.0 Mean of randomly initialized scale parameters; should only + be supplied if 'scales' is not supplied. + param-stddev=0.0 Standard deviation of randomly initialized scale parameters; + should only be supplied if 'scales' is not supplied. + + And the natural-gradient-related configuration values: + rank=8 + update-period=10 + num-samples-history=2000.0 + alpha=4.0 +*/ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { public: