From 954e13cb5fbca65bcf294fe9c96cfc39df226a59 Mon Sep 17 00:00:00 2001 From: Xiaohui Zhang Date: Thu, 22 Mar 2018 14:03:39 -0400 Subject: [PATCH 1/5] [src] Make ClipGradientComponent zero gradients when clipping threshold is zero (#2301) --- src/nnet3/nnet-simple-component.cc | 6 +++++- src/nnet3/nnet-simple-component.h | 29 +++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 6c766fc5df1..f9f286aaed2 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -1,8 +1,10 @@ // nnet3/nnet-simple-component.cc // Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// 2015 Xiaohui Zhang // 2015 Guoguo Chen // 2015 Daniel Galvez +// 2016 Yiming Wang // See ../../COPYING for clarification regarding multiple authors // @@ -642,11 +644,13 @@ void ClipGradientComponent::Backprop(const std::string &debug_info, to_update->num_backpropped_ += 1; RepairGradients(debug_info, in_value, in_deriv, to_update); } + } else if (clipping_threshold_ == 0.0) { + in_deriv->SetZero(); } } // This function will add a self-repair term to in-deriv, attempting to shrink -// the maginitude of the input towards self_repair_target_. +// the magnitude of the input towards self_repair_target_. // This term is proportional to [-(input vector - self_repair_target_)]. // The avarage magnitude of this term is equal to // [self_repair_scale_ * clipped_proportion * average norm of input derivative]. diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 7219361a358..9d438678f5d 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -1221,10 +1221,31 @@ class SumBlockComponent: public Component { }; -// ClipGradientComponent just duplicates its input, but clips gradients -// during backpropagation if they cross a predetermined threshold. -// This component will be used to prevent gradient explosion problem in -// recurrent neural networks +/* + ClipGradientComponent just duplicates its input, but clips gradients + during backpropagation if they cross a predetermined threshold. + This component will be used to prevent gradient explosion problem in + recurrent neural networks. + + Configuration values accepted: + dim Dimension of this component, e.g. 1024 + clipping-threshold Threshold to be used for clipping. It could correspond + to max-row-norm (if norm_based_clipping_ == true) or + max-absolute-value (otherwise). + norm-based-clipping If true, the max-row-norm will be clipped. Else element-wise + absolute value clipping is done. + self-repair-clipped-proportion-threshold The threshold of clipped-proportion + for self-repair mechanism to be activated. The self-repair mechanism + adds a term (proportional to [-(input vector - self_repair_target_)]) + to in-deriv, attempting to shrink the maginitude of the input towards + self_repair_target_ (e.g. 0.0 or 0.5). The default value is 1.0. + self-repair-target The target value towards which self-repair is trying to set + for in-deriv. The default value is 0.0. + self-repair-scale Scale for the self-repair mechanism; see comments above. + The default value is 0.0, but we usually set this to 1.0e-05 (or + occasionally 1.0e-04) in the scripts. +*/ + class ClipGradientComponent: public Component { public: ClipGradientComponent(int32 dim, BaseFloat clipping_threshold, From dbd513c70db8d13c20c1291d25b21799c3cdb453 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 22 Mar 2018 15:44:29 -0400 Subject: [PATCH 2/5] [src] Make CachingOptimizingCompiler thread safe. Thx: Arseniy Gorin (#2288) --- src/nnet3/natural-gradient-online.h | 1 - src/nnet3/nnet-am-decodable-simple.cc | 2 +- src/nnet3/nnet-chain-diagnostics.cc | 2 +- src/nnet3/nnet-chain-training.cc | 2 +- src/nnet3/nnet-component-itf.h | 1 - src/nnet3/nnet-computation.cc | 17 ++ src/nnet3/nnet-computation.h | 16 ++ src/nnet3/nnet-diagnostics.cc | 20 +- src/nnet3/nnet-discriminative-diagnostics.cc | 3 +- src/nnet3/nnet-discriminative-training.cc | 2 +- src/nnet3/nnet-optimize-utils.cc | 121 ++++++++++ src/nnet3/nnet-optimize-utils.h | 68 +++++- src/nnet3/nnet-optimize.cc | 237 +++++++------------ src/nnet3/nnet-optimize.h | 80 ++----- src/nnet3/nnet-training.cc | 2 +- src/nnet3bin/nnet3-xvector-compute.cc | 2 +- src/rnnlm/rnnlm-core-compute.cc | 2 +- src/rnnlm/rnnlm-core-training.cc | 12 +- 18 files changed, 350 insertions(+), 240 deletions(-) diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index 0b05948977e..b49769da540 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -21,7 +21,6 @@ #define KALDI_NNET3_NATURAL_GRADIENT_ONLINE_H_ #include -#include #include "base/kaldi-common.h" #include "matrix/matrix-lib.h" #include "cudamatrix/cu-matrix-lib.h" diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc index 35b1506336e..341f87f0372 100644 --- a/src/nnet3/nnet-am-decodable-simple.cc +++ b/src/nnet3/nnet-am-decodable-simple.cc @@ -248,7 +248,7 @@ void DecodableNnetSimple::DoNnetComputation( request.outputs.resize(1); request.outputs[0].Swap(&output_spec); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); Nnet *nnet_to_update = NULL; // we're not doing any update. NnetComputer computer(opts_.compute_config, *computation, nnet_, nnet_to_update); diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc index 084b33347df..a7e60a5e0c4 100644 --- a/src/nnet3/nnet-chain-diagnostics.cc +++ b/src/nnet3/nnet-chain-diagnostics.cc @@ -100,7 +100,7 @@ void NnetChainComputeProb::Compute(const NnetChainExample &chain_eg) { GetChainComputationRequest(nnet_, chain_eg, need_model_derivative, store_component_stats, use_xent_regularization, use_xent_derivative, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); NnetComputer computer(nnet_config_.compute_config, *computation, nnet_, deriv_nnet_); // give the inputs to the computer object. diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 844fb82d32a..1d149b6f193 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -68,7 +68,7 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { nnet_config.store_component_stats, use_xent_regularization, need_model_derivative, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); if (nnet_config.backstitch_training_scale > 0.0 && num_minibatches_processed_ % nnet_config.backstitch_training_interval == diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 79a1f1a5602..01697353308 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -23,7 +23,6 @@ #define KALDI_NNET3_NNET_COMPONENT_ITF_H_ #include -#include #include "nnet3/nnet-common.h" #include "nnet3/nnet-parse.h" #include "base/kaldi-error.h" diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc index bb0e7c917fc..520d296ee21 100644 --- a/src/nnet3/nnet-computation.cc +++ b/src/nnet3/nnet-computation.cc @@ -1199,6 +1199,23 @@ size_t IoSpecificationHasher::operator () ( (io_spec.has_deriv ? 4261 : 0); } +// ComputationRequests are distinguished by the names and indexes +// of inputs and outputs +size_t ComputationRequestHasher::operator() ( + const ComputationRequest *cr) const noexcept { + size_t ans = 0; + size_t p1 = 4111, p2 = 26951; + IoSpecificationHasher io_hasher; + std::vector::const_iterator itr = cr->inputs.begin(), + end = cr->inputs.end(); + for (; itr != end; ++itr) + ans = ans * p1 + io_hasher(*itr); + itr = cr->outputs.begin(); + end = cr->outputs.end(); + for (; itr != end; ++itr) + ans = ans * p2 + io_hasher(*itr); + return ans; +} diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h index aefcb94c465..97d8b9045ea 100644 --- a/src/nnet3/nnet-computation.h +++ b/src/nnet3/nnet-computation.h @@ -157,6 +157,22 @@ struct ComputationRequest { bool operator== (const ComputationRequest &other) const; }; +// Hash function for ComputationRequest. It converts +// ComputationRequest to hash code by looking at input +// and output IoSpecifications vectors. +struct ComputationRequestHasher { + size_t operator()(const ComputationRequest *cr) const noexcept; +}; + +// Equality function for ComputationRequest pointer +struct ComputationRequestPtrEqual { + public: + bool operator() (const ComputationRequest* cr1, + const ComputationRequest* cr2) const { + return (*cr1) == (*cr2); + } +}; + /** CommandType is an enum that describes the category of the command used in diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc index 8f9f1be24e4..2a6cfe5de6a 100644 --- a/src/nnet3/nnet-diagnostics.cc +++ b/src/nnet3/nnet-diagnostics.cc @@ -83,7 +83,7 @@ void NnetComputeProb::Compute(const NnetExample &eg) { GetComputationRequest(nnet_, eg, need_model_derivative, store_component_stats, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); NnetComputer computer(config_.compute_config, *computation, nnet_, deriv_nnet_); // give the inputs to the computer object. @@ -122,11 +122,11 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg, totals.tot_objective += tot_objf; } // May not be meaningful in non-classification tasks - if (config_.compute_accuracy) { + if (config_.compute_accuracy) { BaseFloat tot_weight, tot_accuracy; PerDimObjectiveInfo &acc_totals = accuracy_info_[io.name]; - if (config_.compute_per_dim_accuracy && + if (config_.compute_per_dim_accuracy && acc_totals.tot_objective_vec.Dim() == 0) { acc_totals.tot_objective_vec.Resize(output.NumCols()); acc_totals.tot_weight_vec.Resize(output.NumCols()); @@ -134,9 +134,9 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg, ComputeAccuracy(io.features, output, &tot_weight, &tot_accuracy, - config_.compute_per_dim_accuracy ? + config_.compute_per_dim_accuracy ? &acc_totals.tot_weight_vec : NULL, - config_.compute_per_dim_accuracy ? + config_.compute_per_dim_accuracy ? &acc_totals.tot_objective_vec : NULL); acc_totals.tot_weight += tot_weight; acc_totals.tot_objective += tot_accuracy; @@ -149,7 +149,7 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg, bool NnetComputeProb::PrintTotalStats() const { bool ans = false; { // First print regular objectives - unordered_map::const_iterator iter, end; iter = objf_info_.begin(); end = objf_info_.end(); @@ -168,8 +168,8 @@ bool NnetComputeProb::PrintTotalStats() const { ans = true; } } - { - unordered_map::const_iterator iter, end; // now print accuracies. iter = accuracy_info_.begin(); @@ -185,14 +185,14 @@ bool NnetComputeProb::PrintTotalStats() const { Vector accuracy_vec(info.tot_weight_vec.Dim()); for (size_t j = 0; j < info.tot_weight_vec.Dim(); j++) { if (info.tot_weight_vec(j) != 0) { - accuracy_vec(j) = info.tot_objective_vec(j) + accuracy_vec(j) = info.tot_objective_vec(j) / info.tot_weight_vec(j); } else { accuracy_vec(j) = -1.0; } } - KALDI_LOG << "Overall per-dim accuracy vector for '" << name + KALDI_LOG << "Overall per-dim accuracy vector for '" << name << "' is " << accuracy_vec << " per frame" << ", over " << info.tot_weight << " frames."; } diff --git a/src/nnet3/nnet-discriminative-diagnostics.cc b/src/nnet3/nnet-discriminative-diagnostics.cc index f23af549d72..488372be8e1 100644 --- a/src/nnet3/nnet-discriminative-diagnostics.cc +++ b/src/nnet3/nnet-discriminative-diagnostics.cc @@ -78,7 +78,7 @@ void NnetDiscriminativeComputeObjf::Compute(const NnetDiscriminativeExample &eg) store_component_stats, use_xent_regularization, use_xent_derivative, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); NnetComputer computer(nnet_config_.compute_config, *computation, nnet_, deriv_nnet_); // give the inputs to the computer object. @@ -206,4 +206,3 @@ const discriminative::DiscriminativeObjectiveInfo* NnetDiscriminativeComputeObjf } // namespace nnet3 } // namespace kaldi - diff --git a/src/nnet3/nnet-discriminative-training.cc b/src/nnet3/nnet-discriminative-training.cc index 0a436b69f8c..91a72c73cca 100644 --- a/src/nnet3/nnet-discriminative-training.cc +++ b/src/nnet3/nnet-discriminative-training.cc @@ -70,7 +70,7 @@ void NnetDiscriminativeTrainer::Train(const NnetDiscriminativeExample &eg) { use_xent_regularization, need_model_derivative, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); NnetComputer computer(nnet_config.compute_config, *computation, *nnet_, diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc index 756ea45e894..e587c7ff947 100644 --- a/src/nnet3/nnet-optimize-utils.cc +++ b/src/nnet3/nnet-optimize-utils.cc @@ -4950,5 +4950,126 @@ void OptimizeMemoryCompression(const Nnet &nnet, } +std::shared_ptr ComputationCache::Find( + const ComputationRequest &in_request) { + std::lock_guard lock(mutex_); + + CacheType::iterator iter = computation_cache_.find(&in_request); + if (iter == computation_cache_.end()) { + return NULL; + } else { + std::shared_ptr ans = iter->second.first; + // Update access record by moving the accessed request to the end of the + // access queue, which declares that it's the most recently used. + access_queue_.splice(access_queue_.end(), access_queue_, + iter->second.second); + return ans; + } +} + + +ComputationCache::ComputationCache(int32 cache_capacity): + cache_capacity_(cache_capacity) { + KALDI_ASSERT(cache_capacity > 0); +} + +std::shared_ptr ComputationCache::Insert( + const ComputationRequest &request_in, + const NnetComputation *computation_in) { + + std::lock_guard lock(mutex_); + if (static_cast(computation_cache_.size()) >= cache_capacity_) { + // Cache has reached capacity; purge the least-recently-accessed request + const CacheType::iterator iter = + computation_cache_.find(access_queue_.front()); + KALDI_ASSERT(iter != computation_cache_.end()); + const ComputationRequest *request = iter->first; + computation_cache_.erase(iter); + delete request; + // we don't need to delete the computation in iter->second.first, as the + // shared_ptr takes care of that automatically. + access_queue_.pop_front(); + } + + // Now insert the thing we need to insert. We'll own the pointer 'request' in + // 'computation_cache_', so we need to allocate our own version. + ComputationRequest *request = new ComputationRequest(request_in); + // When we construct this shared_ptr, it takes ownership of the pointer + // 'computation_in'. + std::shared_ptr computation(computation_in); + + AqType::iterator ait = access_queue_.insert(access_queue_.end(), request); + + std::pair p = computation_cache_.insert( + std::make_pair(request, std::make_pair(computation, ait))); + if (!p.second) { + // if p.second is false, this pair was not inserted because + // a computation for the same computation-request already existed in + // the map. This is possible in multi-threaded operations, if two + // threads try to compile the same computation at the same time (only + // one of them will successfully add it). + // We need to erase the access-queue element that we just added, it's + // no longer going to be needed. + access_queue_.erase(ait); + delete request; + } + return computation; +} + + +void ComputationCache::Read(std::istream &is, bool binary) { + // Note: the object on disk doesn't have tokens like "" + // and "" for back-compatibility reasons. + int32 computation_cache_size; + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &computation_cache_size); + KALDI_ASSERT(computation_cache_size >= 0); + computation_cache_.clear(); + access_queue_.clear(); + ExpectToken(is, binary, ""); + for (size_t c = 0; c < computation_cache_size; c++) { + ComputationRequest request; + request.Read(is, binary); + NnetComputation *computation = new NnetComputation(); + computation->Read(is, binary); + Insert(request, computation); + } +} + +void ComputationCache::Check(const Nnet &nnet) const { + CacheType::const_iterator iter = computation_cache_.begin(), + end = computation_cache_.end(); + // We only need to explicitly delete the pointer to the ComputationRequest. + // The pointers to Computation are deleted automatically by std::shared_ptr + // when the reference count goes to zero. + for (; iter != end; ++iter) { + const NnetComputation &computation = *(iter->second.first); + CheckComputationOptions check_config; + ComputationChecker checker(check_config, nnet, computation); + checker.Check(); + } +} + +void ComputationCache::Write(std::ostream &os, bool binary) const { + WriteToken(os, binary, ""); + WriteBasicType(os, binary, static_cast(computation_cache_.size())); + WriteToken(os, binary, ""); + for (CacheType::const_iterator iter = computation_cache_.begin(); + iter != computation_cache_.end(); ++iter) { + iter->first->Write(os, binary); + iter->second.first->Write(os, binary); + } +} + +ComputationCache::~ComputationCache() { + CacheType::const_iterator iter = computation_cache_.begin(), + end = computation_cache_.end(); + // We only need to explicitly delete the pointer to the ComputationRequest. + // The pointers to Computation are deleted automatically by std::shared_ptr + // when the reference count goes to zero. + for (; iter != end; ++iter) + delete iter->first; +} + } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h index 32adf9e3e19..0a30dcc84cb 100644 --- a/src/nnet3/nnet-optimize-utils.h +++ b/src/nnet3/nnet-optimize-utils.h @@ -20,9 +20,12 @@ #ifndef KALDI_NNET3_NNET_OPTIMIZE_UTILS_H_ #define KALDI_NNET3_NNET_OPTIMIZE_UTILS_H_ +#include +#include #include "nnet3/nnet-compile.h" #include "nnet3/nnet-analyze.h" + namespace kaldi { namespace nnet3 { @@ -613,16 +616,67 @@ void OptimizeLoopedComputation(const Nnet &nnet, void FixGotoLabel(NnetComputation *computation); -/* +/// Class ComputationCache is used inside class CachingOptimizingCompiler to +/// cache previously computed computations. The code was moved from class +/// CachingOptimizingCompiler to this separate class for clarity when adding +/// thread-safety functionality. +/// It's OK to call Find() and Insert() from multiple threads without +/// additional synchronization. +class ComputationCache { + public: + ComputationCache(int32 cache_capacity); - Possible TODO: - optimizations to replace row-by-row copy and add commands with whole-matrix - commands on smaller sub-matrices (if the row-by-row copy commands have certain - regularities). this is a minor issue, we can handle it later. We have to be - careful if this causes sub-matrices to overlap. + // Note: if something fails in Read(), or the written cache was from an older + // format, it will just leave the cache empty. + void Read(std::istream &is, bool binary); + + void Write(std::ostream &os, bool binary) const; + + + // Searches for the computation corresponding to this computation, and returns + // it if cached, or NULL (as std::shared_ptr) if not. (We need shared_ptr to + // handle multi-threaded operation, so that if the computation is ejected from + // the cache by another thread, it won't be deleted while still in use). This + // function also moves this computation to the end of the + // most-recently-accessed queue, which is why it's not const. + std::shared_ptr Find(const ComputationRequest &request); - */ + // Inserts the computation into the cache-- this is assumed to be the + // computation for the computation-request 'request'. Returns a shared_ptr + // which can be used to access the object. This function takes ownership of + // 'computation'. + std::shared_ptr Insert(const ComputationRequest &request, + const NnetComputation *computation); + + ~ComputationCache(); + + // Checks the stored computation for correctness. + void Check(const Nnet &nnet) const; + private: + + std::mutex mutex_; // Read/write mutex. + + int32 cache_capacity_; + + // The access queue for keeping track of the freshness of computation. + // Most-recently-accessed computation is at the end, and + // least-recently-accessed computaiton is at the beginning. Together with + // computation_cache_, this forms a most-recently-used (MRU) cache for + // Computations, indexed by ComputationRequest. The pointers are owned in + // computation_cache_. + typedef std::list AqType; + AqType access_queue_; + + // Map from computation-request to pair of (computation, and position in + // access_queue_). Used for fast lookup of previously compiled computations. + // All pointers are owned here. + typedef unordered_map, AqType::iterator>, + ComputationRequestHasher, + ComputationRequestPtrEqual> CacheType; + CacheType computation_cache_; +}; diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc index ecce196801b..63a7e833c74 100644 --- a/src/nnet3/nnet-optimize.cc +++ b/src/nnet3/nnet-optimize.cc @@ -36,21 +36,19 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &propagate_in_place); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &backprop_in_place); - std::string tok; - ReadToken(is, binary, &tok); - if (tok == "") { + if (PeekToken(is, binary) == 'O') { + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &optimize_row_ops); - ReadToken(is, binary, &tok); - } else { - optimize_row_ops = true; } - if (tok == "") { + if (PeekToken(is, binary) == 'S') { + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &split_row_ops); - ReadToken(is, binary, &tok); - } else { - split_row_ops = true; } - KALDI_ASSERT(tok == ""); + if (PeekToken(is, binary) == 'E') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &extend_matrices); + } + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &convert_addition); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &remove_assignments); @@ -68,14 +66,19 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) { ReadBasicType(is, binary, &min_deriv_time); ExpectToken(is, binary, ""); ReadBasicType(is, binary, &max_deriv_time); - ReadToken(is, binary, &tok); - if (tok == "") { + if (PeekToken(is, binary) == 'M') { + ExpectToken(is, binary, ""); ReadBasicType(is, binary, &max_deriv_time_relative); - ReadToken(is, binary, &tok); } - - - KALDI_ASSERT(tok == ""); + if (PeekToken(is, binary) == 'S') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &snip_row_ops); + } + if (PeekToken(is, binary) == 'M') { + ExpectToken(is, binary, ""); + ReadBasicType(is, binary, &memory_compression_level); + } + ExpectToken(is, binary, ""); } void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const { @@ -90,6 +93,10 @@ void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, backprop_in_place); WriteToken(os, binary, ""); WriteBasicType(os, binary, optimize_row_ops); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, split_row_ops); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, extend_matrices); WriteToken(os, binary, ""); WriteBasicType(os, binary, convert_addition); WriteToken(os, binary, ""); @@ -110,14 +117,20 @@ void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const { WriteBasicType(os, binary, max_deriv_time); WriteToken(os, binary, ""); WriteBasicType(os, binary, max_deriv_time_relative); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, snip_row_ops); + WriteToken(os, binary, ""); + WriteBasicType(os, binary, memory_compression_level); WriteToken(os, binary, ""); } bool NnetOptimizeOptions::operator == (const NnetOptimizeOptions &other) const { - return (other.propagate_in_place == propagate_in_place && - other.optimize == optimize && + return (other.optimize == optimize && other.consolidate_model_update == consolidate_model_update && + other.propagate_in_place == propagate_in_place && other.backprop_in_place == backprop_in_place && + other.optimize_row_ops == optimize_row_ops && + other.split_row_ops == split_row_ops && other.convert_addition == convert_addition && other.remove_assignments == remove_assignments && other.allow_left_merge == allow_left_merge && @@ -127,7 +140,9 @@ bool NnetOptimizeOptions::operator == (const NnetOptimizeOptions &other) const { other.allocate_from_other == allocate_from_other && other.min_deriv_time == min_deriv_time && other.max_deriv_time == max_deriv_time && - other.max_deriv_time_relative == max_deriv_time_relative); + other.max_deriv_time_relative == max_deriv_time_relative && + other.snip_row_ops == snip_row_ops && + other.memory_compression_level == memory_compression_level); } // move commands that resize and zero matrices to as late/early as possible. @@ -613,25 +628,6 @@ void Optimize(const NnetOptimizeOptions &config, KALDI_LOG << "After optimization, max memory use (bytes) = " << GetMaxMemoryUse(*computation); } - -} - -// ComputationRequests are distinguished by the names and indexes -// of inputs and outputs -size_t ComputationRequestHasher::operator() ( - const ComputationRequest *cr) const noexcept { - size_t ans = 0; - size_t p1 = 4111, p2 = 26951; - IoSpecificationHasher io_hasher; - std::vector::const_iterator itr = cr->inputs.begin(), - end = cr->inputs.end(); - for (; itr != end; ++itr) - ans = ans * p1 + io_hasher(*itr); - itr = cr->outputs.begin(); - end = cr->outputs.end(); - for (; itr != end; ++itr) - ans = ans * p2 + io_hasher(*itr); - return ans; } @@ -641,7 +637,8 @@ CachingOptimizingCompiler::CachingOptimizingCompiler( nnet_(nnet), config_(config), seconds_taken_total_(0.0), seconds_taken_compile_(0.0), seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0), - seconds_taken_check_(0.0), seconds_taken_indexes_(0.0) { } + seconds_taken_check_(0.0), seconds_taken_indexes_(0.0), + seconds_taken_io_(0.0), cache_(config.cache_capacity) { } CachingOptimizingCompiler::CachingOptimizingCompiler( const Nnet &nnet, @@ -650,87 +647,41 @@ CachingOptimizingCompiler::CachingOptimizingCompiler( nnet_(nnet), config_(config), opt_config_(opt_config), seconds_taken_total_(0.0), seconds_taken_compile_(0.0), seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0), - seconds_taken_check_(0.0), seconds_taken_indexes_(0.0) { } - -void CachingOptimizingCompiler::UpdateCache(const ComputationRequest *request, - const NnetComputation *computation) { - if (computation_cache_.size() == config_.cache_capacity) { - // full, locate the least-recently-accessed request - const CacheType::iterator it = - computation_cache_.find(access_queue_.front()); - KALDI_ASSERT(it != computation_cache_.end()); - // purge the least-recently-accessed request - const ComputationRequest *r = it->first; - const NnetComputation *c = it->second.first; - computation_cache_.erase(it); - delete r; - delete c; - access_queue_.pop_front(); - } - AqType::iterator ait = access_queue_.insert(access_queue_.end(), request); - computation_cache_.insert(std::make_pair(request, - std::make_pair(computation, ait))); -} + seconds_taken_check_(0.0), seconds_taken_indexes_(0.0), + seconds_taken_io_(0.0), cache_(config.cache_capacity) { } + void CachingOptimizingCompiler::ReadCache(std::istream &is, bool binary) { - NnetOptimizeOptions opt_config_cached; - opt_config_cached.Read(is, binary); - // we won't read cached computations if any optimize option has been changed. - bool read_cache = (opt_config_ == opt_config_cached); - - if (read_cache) { - int32 computation_cache_size; - ExpectToken(is, binary, ""); - ReadBasicType(is, binary, &computation_cache_size); - KALDI_ASSERT(computation_cache_size >= 0); - computation_cache_.clear(); - access_queue_.clear(); - ExpectToken(is, binary, ""); - for (size_t c = 0; c < computation_cache_size; c++) { - ComputationRequest *request = new ComputationRequest(); - request->Read(is, binary); - NnetComputation *computation = new NnetComputation(); - computation->Read(is, binary); - if (GetVerboseLevel() >= 3) { - Timer timer; - CheckComputationOptions check_config; - ComputationChecker checker(check_config, nnet_, *computation); - checker.Check(); - seconds_taken_check_ += timer.Elapsed(); - } - UpdateCache(request, computation); - } + { + Timer timer; + NnetOptimizeOptions opt_config_cached; + opt_config_cached.Read(is, binary); + // we won't read cached computations if any optimize option has been changed. + if (!(opt_config_ == opt_config_cached)) + return; + cache_.Read(is, binary); + seconds_taken_io_ += timer.Elapsed(); + } + if (GetVerboseLevel() >= 2) { + Timer timer; + cache_.Check(nnet_); + seconds_taken_check_ += timer.Elapsed(); + // we consider the check time part of the total time... this is very + // arbitrary but it only affects printed times-taken. + seconds_taken_total_ += timer.Elapsed(); } -} -void CachingOptimizingCompiler::WriteCache(std::ostream &os, bool binary) const { - opt_config_.Write(os, binary); - WriteToken(os, binary, ""); - WriteBasicType(os, binary, static_cast(computation_cache_.size())); - WriteToken(os, binary, ""); - for (CacheType::const_iterator iter = computation_cache_.begin(); - iter != computation_cache_.end(); ++iter) { - iter->first->Write(os, binary); - iter->second.first->Write(os, binary); - } } -void CachingOptimizingCompiler::UpdateAccessQueue(CacheType::iterator &cit) { - // exist, update access record by moving the accessed - // request to the end of the access queue - KALDI_ASSERT(cit != computation_cache_.end()); - access_queue_.splice(access_queue_.end(), access_queue_, - cit->second.second); +void CachingOptimizingCompiler::WriteCache(std::ostream &os, bool binary) { + Timer timer; + opt_config_.Write(os, binary); + cache_.Write(os, binary); + seconds_taken_io_ += timer.Elapsed(); } CachingOptimizingCompiler::~CachingOptimizingCompiler() { - CacheType::const_iterator itr = computation_cache_.begin(), - end = computation_cache_.end(); - for (; itr !=end; ++itr) { - delete itr->first; - delete itr->second.first; - } - if (seconds_taken_total_ > 0.0) { + if (seconds_taken_total_ > 0.0 || seconds_taken_io_ > 0.0) { std::ostringstream os; double seconds_taken_misc = seconds_taken_total_ - seconds_taken_compile_ - seconds_taken_optimize_ - seconds_taken_expand_ @@ -742,52 +693,40 @@ CachingOptimizingCompiler::~CachingOptimizingCompiler() { << seconds_taken_expand_ << " shortcut expansion, " << seconds_taken_check_ << " checking, " << seconds_taken_indexes_ << " computing indexes, " - << seconds_taken_misc << " misc.)"; + << seconds_taken_misc << " misc.) + " + << seconds_taken_io_ << " I/O."; KALDI_LOG << os.str(); // note: the leftover amount is misc things like hashing and == comparisons on // computation-requests, and calling RequestIsDecomposable(). } } -const NnetComputation* CachingOptimizingCompiler::Compile( +std::shared_ptr CachingOptimizingCompiler::Compile( const ComputationRequest &in_request) { Timer timer; - const NnetComputation *ans = CompileInternal(in_request); + std::shared_ptr ans = CompileInternal(in_request); seconds_taken_total_ += timer.Elapsed(); return ans; } -const NnetComputation* CachingOptimizingCompiler::CompileInternal( - const ComputationRequest &in_request) { - const NnetComputation *ans; - // find computation in the cache - CacheType::iterator cit = computation_cache_.find(&in_request); - if (cit == computation_cache_.end()) { - ans = CompileAndCache(in_request); +std::shared_ptr CachingOptimizingCompiler::CompileInternal( + const ComputationRequest &request) { + std::shared_ptr ans = cache_.Find(request); + if (ans != NULL) { + return ans; } else { - // if found, update access queue - const NnetComputation *computation = cit->second.first; - UpdateAccessQueue(cit); - ans = computation; + const NnetComputation *computation = NULL; + if (config_.use_shortcut) + computation = CompileViaShortcut(request); + if (computation == NULL) + computation = CompileNoShortcut(request); + KALDI_ASSERT(computation != NULL); + return cache_.Insert(request, computation); } - return ans; } -const NnetComputation* CachingOptimizingCompiler::CompileAndCache( - const ComputationRequest &in_request) { - // we need to make a copy of ComputationRequest, because it's stored - // as the key in the cache, and we need to own the pointer. - ComputationRequest *request = new ComputationRequest(in_request); - - const NnetComputation *computation = CompileViaShortcut(*request); - if (computation == NULL) - computation = CompileNoShortcut(*request); - UpdateCache(request, computation); - return computation; -} - -const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut( +const NnetComputation *CachingOptimizingCompiler::CompileNoShortcut( const ComputationRequest &request) { Compiler compiler(request, nnet_); @@ -831,12 +770,12 @@ const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut( seconds_taken_optimize_ += timer.Elapsed(); } - if (GetVerboseLevel() >= verbose_cutoff) { std::ostringstream os; computation->Print(os, nnet_); KALDI_LOG << "Optimized computation is: " << os.str(); } + { // check the computation again. Timer timer; CheckComputationOptions check_config; @@ -844,6 +783,7 @@ const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut( checker.Check(); seconds_taken_check_ += timer.Elapsed(); } + { Timer timer; computation->ComputeCudaIndexes(); @@ -853,22 +793,17 @@ const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut( } -const NnetComputation* CachingOptimizingCompiler::CompileViaShortcut( +const NnetComputation *CachingOptimizingCompiler::CompileViaShortcut( const ComputationRequest &request) { - if (!config_.use_shortcut) - return NULL; - int32 num_n_values; ComputationRequest mini_request; if (!RequestIsDecomposable(request, &mini_request, &num_n_values)) return NULL; // By invoking CompileInternal() on the mini request, we go through the same - // caching process as for any externally requested computation. [the only - // difference from Compile() is that it doesn't call the timer code; this - // avoids double-counting the time taken.] This pointer will not have to be - // deleted by this function; it's owned by the class, in the cache. - const NnetComputation *mini_computation = CompileInternal(mini_request); + // caching process as for any externally requested computation. + std::shared_ptr mini_computation = + CompileInternal(mini_request); // note: by default we always create debug_info, even in regular compilation. // (e.g. it defaults to true in CompilerOptions). If it really seems to be a diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index a07c5490c5c..aaa1182e1b6 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -23,8 +23,7 @@ #include "nnet3/nnet-compile.h" #include "nnet3/nnet-analyze.h" - -#include +#include "nnet3/nnet-optimize-utils.h" namespace kaldi { namespace nnet3 { @@ -34,6 +33,8 @@ namespace nnet3 { // detected, we can work out which optimization was responsible for the error. // See the Register() function below for option-specific documentation. struct NnetOptimizeOptions { + // Caution: if adding or removing members, the Read and Write functions and + // the == operator should be modified. This relates to computation caching. bool optimize; // setting this false disallow all optimization. bool consolidate_model_update; bool propagate_in_place; @@ -186,22 +187,6 @@ void Optimize(const NnetOptimizeOptions &config, int32 max_output_time_in_request, NnetComputation *computation); -// Hash function for ComputationRequest. It converts -// ComputationRequest to hash code by looking at input -// and output IoSpecifications vectors. -struct ComputationRequestHasher { - size_t operator()(const ComputationRequest *cr) const noexcept; -}; - -// Equality function for ComputationRequest pointer -struct ComputationRequestPtrEqual { - public: - bool operator() (const ComputationRequest* cr1, - const ComputationRequest* cr2) const { - return (*cr1) == (*cr2); - } -}; - struct CachingOptimizingCompilerOptions { @@ -229,6 +214,8 @@ struct CachingOptimizingCompilerOptions { /// This class enables you to do the compilation and optimization in one call, /// and also ensures that if the ComputationRequest is identical to the previous /// one, the compilation process is not repeated. +/// It is safe to call Compile() from multiple parallel threads without additional +/// synchronization; synchronization is managed internally by class ComputationCache. class CachingOptimizingCompiler { public: CachingOptimizingCompiler(const Nnet &nnet, @@ -242,29 +229,35 @@ class CachingOptimizingCompiler { CachingOptimizingCompilerOptions()); ~CachingOptimizingCompiler(); - /// Does the compilation and returns a const pointer to - /// the result, which is owned by this class, not the caller. - /// It calls ComputeCudaIndexes() for you, because you wouldn't - /// be able to do this on a const object. - const NnetComputation* Compile(const ComputationRequest &request); + + /// Does the compilation and returns a const pointer to the result, which is + /// owned by this class, not the caller. It calls ComputeCudaIndexes() for + /// you, because you wouldn't be able to do this on a const object. + /// + /// Note: this used to return 'const NnetComputation*'. If you get a + /// compilation failure, just replace 'const NnetComputation*' with + /// 'std::shared_ptr' in the calling code. + std::shared_ptr Compile( + const ComputationRequest &request); void ReadCache(std::istream &is, bool binary); - void WriteCache(std::ostream &os, bool binary) const; + void WriteCache(std::ostream &os, bool binary); + private: // This function just implements the work of Compile(); it's made a separate // function for the convenience of the timer code, to avoid it being called // twice (we also call this function directly from inside the class). - const NnetComputation* CompileInternal(const ComputationRequest &request); + std::shared_ptr CompileInternal(const ComputationRequest &request); // This function, called from CompileInternal(), is called when a // ComputationRequest has been determined not to have already been cached. It // otherwise has the same interface as CompileInternal(), but assumes that // there is nothing cached for this computation as yet. It compiles the // computation and takes care of caching it. - const NnetComputation* CompileAndCache(const ComputationRequest &request); + std::shared_ptr CompileAndCache(const ComputationRequest &request); - // This function, called from CompileAndCache(), tries to compile the + // This function, called from CompileInternal(), tries to compile the // ComputationRequest 'request' via 'shortcut' compilation; if this is // possible, it returns a pointer to a newly allocated computation that it has // compiled this way (note: this computation will not yet have been placed in @@ -273,36 +266,19 @@ class CachingOptimizingCompiler { // request was not decomposable because of too few n values or irregular or // unexpected structure), this function returns NULL and you should compile // via CompileNoShortcut. - const NnetComputation* CompileViaShortcut(const ComputationRequest &request); + const NnetComputation *CompileViaShortcut(const ComputationRequest &request); - // This function, called from CompileAndCache(), tries to compile the + // This function, called from CompileInternal(), tries to compile the // ComputationRequest 'request' via the regular (not shortcut) compilation // process; it returns a pointer to a newly allocated computation that it has // compiled this way (note: this computation will not yet have been placed in // the computation cache). - const NnetComputation* CompileNoShortcut(const ComputationRequest &request); + const NnetComputation *CompileNoShortcut(const ComputationRequest &request); const Nnet &nnet_; CachingOptimizingCompilerOptions config_; NnetOptimizeOptions opt_config_; - // The access queue for keeping track of the freshness of computation. - // Most-recently-accessed computation is at the end, and - // least-recently-accessed computaiton is at the beginning. - // Together with computation_cache_, this forms a most-recently-used (MRU) - // cache for Computations, indexed by ComputationRequest. Pointers - // are owned in computation_cache_. - typedef std::list AqType; - AqType access_queue_; - - // Map from computation-request to pair of (computation, and position in - // access_queue_). Used for fast lookup of previously compiled computations. - // All pointers are owned here. - typedef unordered_map, - ComputationRequestHasher, - ComputationRequestPtrEqual> CacheType; - CacheType computation_cache_; // seconds spent in various phases of compilation-- for diagnostic messages double seconds_taken_total_; @@ -311,15 +287,9 @@ class CachingOptimizingCompiler { double seconds_taken_expand_; double seconds_taken_check_; double seconds_taken_indexes_; + double seconds_taken_io_; - // This function updates the computation cache. It is called within - // CompileInternal(). It takes ownership of the pointers. It inserts the - // request at the end of the queue, and purges the least-recently-accessed - // request from the queue and the cache if the capacity is reached. - void UpdateCache(const ComputationRequest *request, - const NnetComputation *computation); - // This function updates the recently accessed queue. - void UpdateAccessQueue(CacheType::iterator &cit); + ComputationCache cache_; }; diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 812b66c41b1..49222549e4e 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -62,7 +62,7 @@ void NnetTrainer::Train(const NnetExample &eg) { GetComputationRequest(*nnet_, eg, need_model_derivative, config_.store_component_stats, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); if (config_.backstitch_training_scale > 0.0 && num_minibatches_processed_ % config_.backstitch_training_interval == diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc index e649dc477d5..664b15eb246 100644 --- a/src/nnet3bin/nnet3-xvector-compute.cc +++ b/src/nnet3bin/nnet3-xvector-compute.cc @@ -44,7 +44,7 @@ static void RunNnetComputation(const MatrixBase &features, output_spec.indexes.resize(1); request.outputs.resize(1); request.outputs[0].Swap(&output_spec); - const NnetComputation *computation = compiler->Compile(request); + std::shared_ptr computation = compiler->Compile(request); Nnet *nnet_to_update = NULL; // we're not doing any update. NnetComputer computer(NnetComputeOptions(), *computation, nnet, nnet_to_update); diff --git a/src/rnnlm/rnnlm-core-compute.cc b/src/rnnlm/rnnlm-core-compute.cc index d7ec22dc538..f0cf4487c2b 100644 --- a/src/rnnlm/rnnlm-core-compute.cc +++ b/src/rnnlm/rnnlm-core-compute.cc @@ -44,7 +44,7 @@ BaseFloat RnnlmCoreComputer::Compute( store_component_stats, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); NnetComputeOptions compute_opts; diff --git a/src/rnnlm/rnnlm-core-training.cc b/src/rnnlm/rnnlm-core-training.cc index ddf4e7b3fb6..63a6dee188d 100644 --- a/src/rnnlm/rnnlm-core-training.cc +++ b/src/rnnlm/rnnlm-core-training.cc @@ -156,7 +156,7 @@ void RnnlmCoreTrainer::Train( store_component_stats, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); NnetComputeOptions compute_opts; @@ -178,12 +178,12 @@ void RnnlmCoreTrainer::Train( word_embedding_deriv->AddSmatMat(1.0, derived.input_words_smat, kNoTrans, input_deriv, 1.0); } - // If relevant, add in the part of the gradient that comes from L2 + // If relevant, add in the part of the gradient that comes from L2 // regularization. ApplyL2Regularization(*nnet_, minibatch.num_chunks * config_.l2_regularize_factor, delta_nnet_); - + bool success = UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, 1.0, 1.0 - config_.momentum, nnet_, &num_max_change_per_component_applied_, &num_max_change_global_applied_); @@ -214,7 +214,7 @@ void RnnlmCoreTrainer::TrainBackstitch( store_component_stats, &request); - const NnetComputation *computation = compiler_.Compile(request); + std::shared_ptr computation = compiler_.Compile(request); NnetComputeOptions compute_opts; @@ -259,7 +259,7 @@ void RnnlmCoreTrainer::TrainBackstitch( minibatch.num_chunks * config_.l2_regularize_factor, delta_nnet_); } - + UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change, max_change_scale, scale_adding, nnet_, &num_max_change_per_component_applied_, &num_max_change_global_applied_); @@ -309,7 +309,7 @@ void RnnlmCoreTrainer::PrintMaxChangeStats() const { if (num_max_change_global_applied_ > 0) KALDI_LOG << "The global max-change was enforced " << (100.0 * num_max_change_global_applied_) / - (num_minibatches_processed_ * + (num_minibatches_processed_ * (config_.backstitch_training_scale == 0.0 ? 1.0 : 1.0 + 1.0 / config_.backstitch_training_interval)) << "\% of the time."; From f4a5667df60f14314a6fdc92efd28e733fbcf150 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Thu, 22 Mar 2018 16:17:16 -0400 Subject: [PATCH 3/5] [src] Fix to comment --- src/nnet3/nnet-optimize.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h index aaa1182e1b6..78763732469 100644 --- a/src/nnet3/nnet-optimize.h +++ b/src/nnet3/nnet-optimize.h @@ -236,7 +236,7 @@ class CachingOptimizingCompiler { /// /// Note: this used to return 'const NnetComputation*'. If you get a /// compilation failure, just replace 'const NnetComputation*' with - /// 'std::shared_ptr' in the calling code. + /// 'std::shared_ptr' in the calling code. std::shared_ptr Compile( const ComputationRequest &request); void ReadCache(std::istream &is, bool binary); From 9ae3eb78aee0615b28f0a5acec0e6f47635a9765 Mon Sep 17 00:00:00 2001 From: Arseniy Gorin Date: Thu, 22 Mar 2018 23:36:58 +0300 Subject: [PATCH 4/5] [src,scripts] Make cache size configurable for xvector extraction (#2290) --- egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh | 13 ++++++++----- src/nnet3bin/nnet3-xvector-compute.cc | 6 +++++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh b/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh index 5b8a32b5468..2cfda807c8c 100755 --- a/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh +++ b/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh @@ -14,9 +14,11 @@ # Begin configuration section. nj=30 cmd="run.pl" -chunk_size=-1 # The chunk size over which the embedding is extracted. - # If left unspecified, it uses the max_chunk_size in the nnet - # directory. + +cache_capacity=64 # Cache capacity for x-vector extractor +chunk_size=-1 # The chunk size over which the embedding is extracted. + # If left unspecified, it uses the max_chunk_size in the nnet + # directory. use_gpu=false stage=0 @@ -34,6 +36,7 @@ if [ $# != 3 ]; then echo " --use-gpu # If true, use GPU." echo " --nj # Number of jobs" echo " --stage # To control partial reruns" + echo " --cache-capacity # To speed-up xvector extraction" echo " --chunk-size # If provided, extracts embeddings with specified" echo " # chunk size, and averages to produce final embedding" fi @@ -77,13 +80,13 @@ if [ $stage -le 0 ]; then if $use_gpu; then for g in $(seq $nj); do $cmd --gpu 1 ${dir}/log/extract.$g.log \ - nnet3-xvector-compute --use-gpu=yes --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size \ + nnet3-xvector-compute --use-gpu=yes --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size --cache-capacity=${cache_capacity} \ "$nnet" "`echo $feat | sed s/JOB/$g/g`" ark,scp:${dir}/xvector.$g.ark,${dir}/xvector.$g.scp || exit 1 & done wait else $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \ - nnet3-xvector-compute --use-gpu=no --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size \ + nnet3-xvector-compute --use-gpu=no --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size --cache-capacity=${cache_capacity} \ "$nnet" "$feat" ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp || exit 1; fi fi diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc index 664b15eb246..33edc8c9fa6 100644 --- a/src/nnet3bin/nnet3-xvector-compute.cc +++ b/src/nnet3bin/nnet3-xvector-compute.cc @@ -91,6 +91,8 @@ int main(int argc, char *argv[]) { Timer timer; NnetSimpleComputationOptions opts; + CachingOptimizingCompilerOptions compiler_config; + opts.acoustic_scale = 1.0; // by default do no scaling in this recipe. std::string use_gpu = "no"; @@ -98,6 +100,8 @@ int main(int argc, char *argv[]) { min_chunk_size = 100; opts.Register(&po); + compiler_config.Register(&po); + po.Register("use-gpu", &use_gpu, "yes|no|optional|wait, only has effect if compiled with CUDA"); po.Register("chunk-size", &chunk_size, @@ -127,7 +131,7 @@ int main(int argc, char *argv[]) { SetDropoutTestMode(true, &nnet); CollapseModel(CollapseModelConfig(), &nnet); - CachingOptimizingCompiler compiler(nnet, opts.optimize_config); + CachingOptimizingCompiler compiler(nnet, opts.optimize_config, compiler_config); BaseFloatVectorWriter vector_writer(vector_wspecifier); From 6dbe7909dd27204d02bf87a96e3ba6d8dd0f7a6c Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Thu, 22 Mar 2018 21:13:02 -0400 Subject: [PATCH 5/5] [src] Speedup and code simplification for chain supervision merging (etc.) (#2302) --- src/chain/chain-supervision-test.cc | 40 +++++++---------- src/chain/chain-supervision.cc | 56 +++++++++--------------- src/chain/chain-supervision.h | 7 ++- src/nnet3/discriminative-supervision.cc | 41 ++++++----------- src/nnet3/discriminative-supervision.h | 3 +- src/nnet3/nnet-chain-example.cc | 9 +--- src/nnet3/nnet-discriminative-example.cc | 9 +--- 7 files changed, 58 insertions(+), 107 deletions(-) diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc index d14c80cd84f..f93fa3aef7b 100644 --- a/src/chain/chain-supervision-test.cc +++ b/src/chain/chain-supervision-test.cc @@ -157,17 +157,11 @@ void TestSupervisionAppend(const TransitionModel &trans_model, std::vector input(num_append); for (int32 i = 0; i < num_append; i++) input[i] = &supervision; - std::vector output; - bool compactify = (RandInt(0, 1) == 0); - AppendSupervision(input, compactify, &output); - if (compactify) { - KALDI_ASSERT(output.size() == 1 && - output[0].frames_per_sequence == - supervision.frames_per_sequence && - output[0].num_sequences == num_append); - } else { - KALDI_ASSERT(output.size() == input.size()); - } + Supervision output; + AppendSupervision(input, &output); + KALDI_ASSERT(output.frames_per_sequence == + supervision.frames_per_sequence && + output.num_sequences == num_append); int32 tot_sequences_in = 0, tot_sequences_out = 0, tot_frames_in = 0, tot_frames_out = 0; for (int32 i = 0; i < num_append; i++) { @@ -175,17 +169,15 @@ void TestSupervisionAppend(const TransitionModel &trans_model, tot_frames_in += input[i]->num_sequences * input[i]->frames_per_sequence; } - for (int32 i = 0; i < output.size(); i++) { - tot_sequences_out += output[i].num_sequences; - tot_frames_out += output[i].num_sequences * - output[i].frames_per_sequence; - } + tot_sequences_out += output.num_sequences; + tot_frames_out += output.num_sequences * + output.frames_per_sequence; KALDI_ASSERT(tot_sequences_out == tot_sequences_in && tot_frames_out == tot_frames_in); - TestSupervisionIo(output[0]); - TestSupervisionNumerator(output[0]); - output[0].Check(trans_model); + TestSupervisionIo(output); + TestSupervisionNumerator(output); + output.Check(trans_model); } void TestSupervisionReattached(const TransitionModel &trans_model, @@ -368,18 +360,16 @@ void TestSupervisionSplitting(const ContextDependency &ctx_dep, TestSupervisionIo(split_supervision[RandInt(0, num_ranges - 1)]); TestSupervisionFrames(split_supervision[RandInt(0, num_ranges - 1)]); - std::vector reattached_supervision; + Supervision reattached_supervision; std::vector to_append(num_ranges); for (int32 i = 0; i < num_ranges; i++) to_append[i] = &(split_supervision[i]); - bool compactify = true; - AppendSupervision(to_append, compactify, &reattached_supervision); - KALDI_ASSERT(reattached_supervision.size() == 1); - ChainTrainingTest(den_graph, reattached_supervision[0]); + AppendSupervision(to_append, &reattached_supervision); + ChainTrainingTest(den_graph, reattached_supervision); if (num_frames % frames_per_range == 0) { TestSupervisionReattached(trans_model, supervision, - reattached_supervision[0]); + reattached_supervision); } } } diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc index 6b386638771..579e3d7b3e0 100644 --- a/src/chain/chain-supervision.cc +++ b/src/chain/chain-supervision.cc @@ -694,71 +694,57 @@ Supervision::Supervision(const Supervision &other): // This static function is called by AppendSupervision if the supervisions // are end2end. It simply puts all e2e FST's into 1 supervision. void AppendSupervisionE2e(const std::vector &input, - bool compactify, - std::vector *output_supervision) { + Supervision *output_supervision) { KALDI_ASSERT(!input.empty()); KALDI_ASSERT(input[0]->e2e); - output_supervision->clear(); - output_supervision->resize(1); KALDI_ASSERT(input[0]->e2e_fsts.size() == 1); - (*output_supervision)[0] = *(input[0]); + *output_supervision = *(input[0]); for (int32 i = 1; i < input.size(); i++) { - (*output_supervision)[0].num_sequences++; + output_supervision->num_sequences++; KALDI_ASSERT(input[i]->e2e_fsts.size() == 1); KALDI_ASSERT(input[i]->frames_per_sequence == - (*output_supervision)[0].frames_per_sequence); - (*output_supervision)[0].e2e_fsts.push_back(input[i]->e2e_fsts[0]); + output_supervision->frames_per_sequence); + output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[0]); } } void AppendSupervision(const std::vector &input, - bool compactify, - std::vector *output_supervision) { + Supervision *output_supervision) { KALDI_ASSERT(!input.empty()); int32 label_dim = input[0]->label_dim, num_inputs = input.size(); if (num_inputs == 1) { - output_supervision->resize(1); - (*output_supervision)[0] = *(input[0]); + *output_supervision = *(input[0]); return; } if (input[0]->e2e) { - AppendSupervisionE2e(input, compactify, output_supervision); + AppendSupervisionE2e(input, output_supervision); return; } - std::vector output_was_merged; for (int32 i = 1; i < num_inputs; i++) KALDI_ASSERT(input[i]->label_dim == label_dim && "Trying to append incompatible Supervision objects"); - output_supervision->clear(); - output_supervision->reserve(input.size()); - for (int32 i = 0; i < input.size(); i++) { + *output_supervision = *(input[num_inputs-1]); + for (int32 i = num_inputs - 2; i >= 0; i--) { const Supervision &src = *(input[i]); - if (compactify && !output_supervision->empty() && - output_supervision->back().weight == src.weight && - output_supervision->back().frames_per_sequence == + if (output_supervision->weight == src.weight && + output_supervision->frames_per_sequence == src.frames_per_sequence) { // Combine with current output // append src.fst to output_supervision->fst. - fst::Concat(&output_supervision->back().fst, src.fst); - output_supervision->back().num_sequences++; - output_was_merged.back() = true; + // the complexity here is O(V1 + E1) + fst::Concat(src.fst, &output_supervision->fst); + output_supervision->num_sequences++; } else { - output_supervision->resize(output_supervision->size() + 1); - output_supervision->back() = src; - output_was_merged.push_back(false); - } - } - KALDI_ASSERT(output_was_merged.size() == output_supervision->size()); - for (size_t i = 0; i < output_supervision->size(); i++) { - if (output_was_merged[i]) { - fst::StdVectorFst &out_fst = (*output_supervision)[i].fst; - // The process of concatenation will have introduced epsilons. - fst::RmEpsilon(&out_fst); - SortBreadthFirstSearch(&out_fst); + KALDI_ERR << "Mismatch weight or frames_per_sequence between inputs"; } + } + fst::StdVectorFst &out_fst = output_supervision->fst; + // The process of concatenation will have introduced epsilons. + fst::RmEpsilon(&out_fst); + SortBreadthFirstSearch(&out_fst); } // This static function is called by AddWeightToSupervisionFst if the supervision diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h index 641902e6de0..13866e2aba6 100644 --- a/src/chain/chain-supervision.h +++ b/src/chain/chain-supervision.h @@ -380,15 +380,14 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst, /// This function appends a list of supervision objects to create what will /// usually be a single such object, but if the weights and num-frames are not /// all the same it will only append Supervision objects where successive ones -/// have the same weight and num-frames, and if 'compactify' is true. The -/// normal use-case for this is when you are combining neural-net examples for +/// have the same weight and num-frames. +/// The normal use-case for this is when you are combining neural-net examples for /// training; appending them like this helps to simplify the training process. /// This function will crash if the values of label_dim in the inputs are not /// all the same. void AppendSupervision(const std::vector &input, - bool compactify, - std::vector *output_supervision); + Supervision *output_supervision); /// This function helps you to pseudo-randomly split a sequence of length 'num_frames', diff --git a/src/nnet3/discriminative-supervision.cc b/src/nnet3/discriminative-supervision.cc index 94a165f4c50..716650f16fa 100644 --- a/src/nnet3/discriminative-supervision.cc +++ b/src/nnet3/discriminative-supervision.cc @@ -400,49 +400,36 @@ void DiscriminativeSupervisionSplitter::ComputeLatticeScores(const Lattice &lat, } void AppendSupervision(const std::vector &input, - bool compactify, - std::vector *output_supervision) { + DiscriminativeSupervision *output_supervision) { KALDI_ASSERT(!input.empty()); int32 num_inputs = input.size(); if (num_inputs == 1) { - output_supervision->resize(1); - (*output_supervision)[0] = *(input[0]); + *output_supervision = *(input[0]); return; } - std::vector output_was_merged; - output_supervision->clear(); - output_supervision->reserve(input.size()); - for (int32 i = 0; i < input.size(); i++) { + *output_supervision = *(input[num_inputs-1]); + for (int32 i = num_inputs - 2; i >= 0; i--) { const DiscriminativeSupervision &src = *(input[i]); KALDI_ASSERT(src.num_sequences == 1); - if (compactify && !output_supervision->empty() && - output_supervision->back().weight == src.weight && - output_supervision->back().frames_per_sequence == + if (output_supervision->weight == src.weight && + output_supervision->frames_per_sequence == src.frames_per_sequence) { // Combine with current output // append src.den_lat to output_supervision->den_lat. - fst::Concat(&output_supervision->back().den_lat, src.den_lat); + fst::Concat(src.den_lat, &output_supervision->den_lat); - output_supervision->back().num_ali.insert( - output_supervision->back().num_ali.end(), + output_supervision->num_ali.insert( + output_supervision->num_ali.begin(), src.num_ali.begin(), src.num_ali.end()); - output_supervision->back().num_sequences++; - output_was_merged.back() = true; + output_supervision->num_sequences++; } else { - output_supervision->resize(output_supervision->size() + 1); - output_supervision->back() = src; - output_was_merged.push_back(false); - } - } - KALDI_ASSERT(output_was_merged.size() == output_supervision->size()); - for (size_t i = 0; i < output_supervision->size(); i++) { - if (output_was_merged[i]) { - DiscriminativeSupervision &out_sup = (*output_supervision)[i]; - fst::TopSort(&(out_sup.den_lat)); - out_sup.Check(); + KALDI_ERR << "Mismatch weight or frames_per_sequence between inputs"; } } + DiscriminativeSupervision &out_sup = *output_supervision; + fst::TopSort(&(out_sup.den_lat)); + out_sup.Check(); } } // namespace discriminative diff --git a/src/nnet3/discriminative-supervision.h b/src/nnet3/discriminative-supervision.h index a9d58d120f5..6fd684093e2 100644 --- a/src/nnet3/discriminative-supervision.h +++ b/src/nnet3/discriminative-supervision.h @@ -223,8 +223,7 @@ class DiscriminativeSupervisionSplitter { /// training; appending them like this helps to simplify the training process. void AppendSupervision(const std::vector &input, - bool compactify, - std::vector *output_supervision); + DiscriminativeSupervision *output_supervision); } // namespace discriminative diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc index 351312fb952..aad5f83bc80 100644 --- a/src/nnet3/nnet-chain-example.cc +++ b/src/nnet3/nnet-chain-example.cc @@ -204,15 +204,10 @@ static void MergeSupervision( input_supervision.reserve(inputs.size()); for (int32 n = 0; n < num_inputs; n++) input_supervision.push_back(&(inputs[n]->supervision)); - std::vector output_supervision; - bool compactify = true; + chain::Supervision output_supervision; AppendSupervision(input_supervision, - compactify, &output_supervision); - if (output_supervision.size() != 1) - KALDI_ERR << "Failed to merge 'chain' examples-- inconsistent lengths " - << "or weights?"; - output->supervision.Swap(&(output_supervision[0])); + output->supervision.Swap(&output_supervision); output->indexes.clear(); output->indexes.reserve(num_indexes); diff --git a/src/nnet3/nnet-discriminative-example.cc b/src/nnet3/nnet-discriminative-example.cc index 61a9669fb76..29eb65c30b1 100644 --- a/src/nnet3/nnet-discriminative-example.cc +++ b/src/nnet3/nnet-discriminative-example.cc @@ -196,15 +196,10 @@ void MergeSupervision( input_supervision.reserve(inputs.size()); for (int32 n = 0; n < num_inputs; n++) input_supervision.push_back(&(inputs[n]->supervision)); - std::vector output_supervision; - bool compactify = true; + discriminative::DiscriminativeSupervision output_supervision; discriminative::AppendSupervision(input_supervision, - compactify, &output_supervision); - if (output_supervision.size() != 1) - KALDI_ERR << "Failed to merge discriminative examples-- inconsistent lengths " - << "or weights?"; - output->supervision.Swap(&(output_supervision[0])); + output->supervision.Swap(&(output_supervision)); output->indexes.clear(); output->indexes.reserve(num_indexes);