From 954e13cb5fbca65bcf294fe9c96cfc39df226a59 Mon Sep 17 00:00:00 2001
From: Xiaohui Zhang <samuelzhang1104@gmail.com>
Date: Thu, 22 Mar 2018 14:03:39 -0400
Subject: [PATCH 1/5] [src] Make ClipGradientComponent zero gradients when
 clipping threshold is zero (#2301)

---
 src/nnet3/nnet-simple-component.cc |  6 +++++-
 src/nnet3/nnet-simple-component.h  | 29 +++++++++++++++++++++++++----
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc
index 6c766fc5df1..f9f286aaed2 100644
--- a/src/nnet3/nnet-simple-component.cc
+++ b/src/nnet3/nnet-simple-component.cc
@@ -1,8 +1,10 @@
 // nnet3/nnet-simple-component.cc
 
 // Copyright      2015  Johns Hopkins University (author: Daniel Povey)
+//                2015  Xiaohui Zhang
 //                2015  Guoguo Chen
 //                2015  Daniel Galvez
+//                2016  Yiming Wang
 
 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -642,11 +644,13 @@ void ClipGradientComponent::Backprop(const std::string &debug_info,
       to_update->num_backpropped_ += 1;
       RepairGradients(debug_info, in_value, in_deriv, to_update);
     }
+  } else if (clipping_threshold_ == 0.0) {
+    in_deriv->SetZero();
   }
 }
 
 // This function will add a self-repair term to in-deriv, attempting to shrink
-// the maginitude of the input towards self_repair_target_.
+// the magnitude of the input towards self_repair_target_.
 // This term is proportional to [-(input vector - self_repair_target_)].
 // The avarage magnitude of this term is equal to
 // [self_repair_scale_ * clipped_proportion * average norm of input derivative].
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h
index 7219361a358..9d438678f5d 100644
--- a/src/nnet3/nnet-simple-component.h
+++ b/src/nnet3/nnet-simple-component.h
@@ -1221,10 +1221,31 @@ class SumBlockComponent: public Component {
 };
 
 
-// ClipGradientComponent just duplicates its input, but clips gradients
-// during backpropagation if they cross a predetermined threshold.
-// This component will be used to prevent gradient explosion problem in
-// recurrent neural networks
+/*
+ ClipGradientComponent just duplicates its input, but clips gradients
+ during backpropagation if they cross a predetermined threshold.
+ This component will be used to prevent gradient explosion problem in
+ recurrent neural networks.
+
+   Configuration values accepted:
+      dim                   Dimension of this component, e.g. 1024
+      clipping-threshold    Threshold to be used for clipping. It could correspond
+                            to max-row-norm (if norm_based_clipping_ == true) or
+                            max-absolute-value (otherwise).
+      norm-based-clipping   If true, the max-row-norm will be clipped. Else element-wise
+                            absolute value clipping is done.
+      self-repair-clipped-proportion-threshold  The threshold of clipped-proportion
+                            for self-repair mechanism to be activated. The self-repair mechanism
+                            adds a term (proportional to [-(input vector - self_repair_target_)])
+                            to in-deriv, attempting to shrink the maginitude of the input towards
+                            self_repair_target_ (e.g. 0.0 or 0.5). The default value is 1.0.
+      self-repair-target    The target value towards which self-repair is trying to set
+                            for in-deriv. The default value is 0.0.
+      self-repair-scale     Scale for the self-repair mechanism; see comments above.
+                            The default value is 0.0, but we usually set this to 1.0e-05 (or
+                            occasionally 1.0e-04) in the scripts.
+*/
+
 class ClipGradientComponent: public Component {
  public:
   ClipGradientComponent(int32 dim, BaseFloat clipping_threshold,

From dbd513c70db8d13c20c1291d25b21799c3cdb453 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 22 Mar 2018 15:44:29 -0400
Subject: [PATCH 2/5] [src] Make CachingOptimizingCompiler thread safe.  Thx:
 Arseniy Gorin (#2288)

---
 src/nnet3/natural-gradient-online.h          |   1 -
 src/nnet3/nnet-am-decodable-simple.cc        |   2 +-
 src/nnet3/nnet-chain-diagnostics.cc          |   2 +-
 src/nnet3/nnet-chain-training.cc             |   2 +-
 src/nnet3/nnet-component-itf.h               |   1 -
 src/nnet3/nnet-computation.cc                |  17 ++
 src/nnet3/nnet-computation.h                 |  16 ++
 src/nnet3/nnet-diagnostics.cc                |  20 +-
 src/nnet3/nnet-discriminative-diagnostics.cc |   3 +-
 src/nnet3/nnet-discriminative-training.cc    |   2 +-
 src/nnet3/nnet-optimize-utils.cc             | 121 ++++++++++
 src/nnet3/nnet-optimize-utils.h              |  68 +++++-
 src/nnet3/nnet-optimize.cc                   | 237 +++++++------------
 src/nnet3/nnet-optimize.h                    |  80 ++-----
 src/nnet3/nnet-training.cc                   |   2 +-
 src/nnet3bin/nnet3-xvector-compute.cc        |   2 +-
 src/rnnlm/rnnlm-core-compute.cc              |   2 +-
 src/rnnlm/rnnlm-core-training.cc             |  12 +-
 18 files changed, 350 insertions(+), 240 deletions(-)

diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h
index 0b05948977e..b49769da540 100644
--- a/src/nnet3/natural-gradient-online.h
+++ b/src/nnet3/natural-gradient-online.h
@@ -21,7 +21,6 @@
 #define KALDI_NNET3_NATURAL_GRADIENT_ONLINE_H_
 
 #include <iostream>
-#include <mutex>
 #include "base/kaldi-common.h"
 #include "matrix/matrix-lib.h"
 #include "cudamatrix/cu-matrix-lib.h"
diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc
index 35b1506336e..341f87f0372 100644
--- a/src/nnet3/nnet-am-decodable-simple.cc
+++ b/src/nnet3/nnet-am-decodable-simple.cc
@@ -248,7 +248,7 @@ void DecodableNnetSimple::DoNnetComputation(
   request.outputs.resize(1);
   request.outputs[0].Swap(&output_spec);
 
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
   Nnet *nnet_to_update = NULL;  // we're not doing any update.
   NnetComputer computer(opts_.compute_config, *computation,
                         nnet_, nnet_to_update);
diff --git a/src/nnet3/nnet-chain-diagnostics.cc b/src/nnet3/nnet-chain-diagnostics.cc
index 084b33347df..a7e60a5e0c4 100644
--- a/src/nnet3/nnet-chain-diagnostics.cc
+++ b/src/nnet3/nnet-chain-diagnostics.cc
@@ -100,7 +100,7 @@ void NnetChainComputeProb::Compute(const NnetChainExample &chain_eg) {
   GetChainComputationRequest(nnet_, chain_eg, need_model_derivative,
                              store_component_stats, use_xent_regularization,
                              use_xent_derivative, &request);
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
   NnetComputer computer(nnet_config_.compute_config, *computation,
                         nnet_, deriv_nnet_);
   // give the inputs to the computer object.
diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc
index 844fb82d32a..1d149b6f193 100644
--- a/src/nnet3/nnet-chain-training.cc
+++ b/src/nnet3/nnet-chain-training.cc
@@ -68,7 +68,7 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) {
                              nnet_config.store_component_stats,
                              use_xent_regularization, need_model_derivative,
                              &request);
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
 
   if (nnet_config.backstitch_training_scale > 0.0 && num_minibatches_processed_
       % nnet_config.backstitch_training_interval ==
diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h
index 79a1f1a5602..01697353308 100644
--- a/src/nnet3/nnet-component-itf.h
+++ b/src/nnet3/nnet-component-itf.h
@@ -23,7 +23,6 @@
 #define KALDI_NNET3_NNET_COMPONENT_ITF_H_
 
 #include <iostream>
-#include <mutex>
 #include "nnet3/nnet-common.h"
 #include "nnet3/nnet-parse.h"
 #include "base/kaldi-error.h"
diff --git a/src/nnet3/nnet-computation.cc b/src/nnet3/nnet-computation.cc
index bb0e7c917fc..520d296ee21 100644
--- a/src/nnet3/nnet-computation.cc
+++ b/src/nnet3/nnet-computation.cc
@@ -1199,6 +1199,23 @@ size_t IoSpecificationHasher::operator () (
       (io_spec.has_deriv ? 4261 : 0);
 }
 
+// ComputationRequests are distinguished by the names and indexes
+// of inputs and outputs
+size_t ComputationRequestHasher::operator() (
+    const ComputationRequest *cr) const noexcept {
+  size_t ans = 0;
+  size_t p1 = 4111, p2 = 26951;
+  IoSpecificationHasher io_hasher;
+  std::vector<IoSpecification>::const_iterator itr = cr->inputs.begin(),
+                                               end = cr->inputs.end();
+  for (; itr != end; ++itr)
+    ans = ans * p1 + io_hasher(*itr);
+  itr = cr->outputs.begin();
+  end = cr->outputs.end();
+  for (; itr != end; ++itr)
+    ans = ans * p2 + io_hasher(*itr);
+  return ans;
+}
 
 
 
diff --git a/src/nnet3/nnet-computation.h b/src/nnet3/nnet-computation.h
index aefcb94c465..97d8b9045ea 100644
--- a/src/nnet3/nnet-computation.h
+++ b/src/nnet3/nnet-computation.h
@@ -157,6 +157,22 @@ struct ComputationRequest {
   bool operator== (const ComputationRequest &other) const;
 };
 
+// Hash function for ComputationRequest. It converts
+// ComputationRequest to hash code by looking at input
+// and output IoSpecifications vectors.
+struct ComputationRequestHasher {
+  size_t operator()(const ComputationRequest *cr) const noexcept;
+};
+
+// Equality function for ComputationRequest pointer
+struct ComputationRequestPtrEqual {
+ public:
+  bool operator() (const ComputationRequest* cr1,
+                   const ComputationRequest* cr2) const {
+    return (*cr1) == (*cr2);
+  }
+};
+
 
 /**
    CommandType is an enum that describes the category of the command used in
diff --git a/src/nnet3/nnet-diagnostics.cc b/src/nnet3/nnet-diagnostics.cc
index 8f9f1be24e4..2a6cfe5de6a 100644
--- a/src/nnet3/nnet-diagnostics.cc
+++ b/src/nnet3/nnet-diagnostics.cc
@@ -83,7 +83,7 @@ void NnetComputeProb::Compute(const NnetExample &eg) {
   GetComputationRequest(nnet_, eg, need_model_derivative,
                         store_component_stats,
                         &request);
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
   NnetComputer computer(config_.compute_config, *computation,
                         nnet_, deriv_nnet_);
   // give the inputs to the computer object.
@@ -122,11 +122,11 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg,
         totals.tot_objective += tot_objf;
       }
       // May not be meaningful in non-classification tasks
-      if (config_.compute_accuracy) {  
+      if (config_.compute_accuracy) {
         BaseFloat tot_weight, tot_accuracy;
         PerDimObjectiveInfo &acc_totals = accuracy_info_[io.name];
 
-        if (config_.compute_per_dim_accuracy && 
+        if (config_.compute_per_dim_accuracy &&
             acc_totals.tot_objective_vec.Dim() == 0) {
           acc_totals.tot_objective_vec.Resize(output.NumCols());
           acc_totals.tot_weight_vec.Resize(output.NumCols());
@@ -134,9 +134,9 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg,
 
         ComputeAccuracy(io.features, output,
                         &tot_weight, &tot_accuracy,
-                        config_.compute_per_dim_accuracy ? 
+                        config_.compute_per_dim_accuracy ?
                           &acc_totals.tot_weight_vec : NULL,
-                        config_.compute_per_dim_accuracy ? 
+                        config_.compute_per_dim_accuracy ?
                           &acc_totals.tot_objective_vec : NULL);
         acc_totals.tot_weight += tot_weight;
         acc_totals.tot_objective += tot_accuracy;
@@ -149,7 +149,7 @@ void NnetComputeProb::ProcessOutputs(const NnetExample &eg,
 bool NnetComputeProb::PrintTotalStats() const {
   bool ans = false;
   { // First print regular objectives
-    unordered_map<std::string, SimpleObjectiveInfo, 
+    unordered_map<std::string, SimpleObjectiveInfo,
                   StringHasher>::const_iterator iter, end;
     iter = objf_info_.begin();
     end = objf_info_.end();
@@ -168,8 +168,8 @@ bool NnetComputeProb::PrintTotalStats() const {
         ans = true;
     }
   }
-  { 
-    unordered_map<std::string, PerDimObjectiveInfo, 
+  {
+    unordered_map<std::string, PerDimObjectiveInfo,
                   StringHasher>::const_iterator iter, end;
     // now print accuracies.
     iter = accuracy_info_.begin();
@@ -185,14 +185,14 @@ bool NnetComputeProb::PrintTotalStats() const {
         Vector<BaseFloat> accuracy_vec(info.tot_weight_vec.Dim());
         for (size_t j = 0; j < info.tot_weight_vec.Dim(); j++) {
           if (info.tot_weight_vec(j) !=  0) {
-            accuracy_vec(j) = info.tot_objective_vec(j) 
+            accuracy_vec(j) = info.tot_objective_vec(j)
                               / info.tot_weight_vec(j);
           } else {
             accuracy_vec(j) = -1.0;
           }
         }
 
-        KALDI_LOG << "Overall per-dim accuracy vector for '" << name 
+        KALDI_LOG << "Overall per-dim accuracy vector for '" << name
                   << "' is " << accuracy_vec << " per frame"
                   << ", over " << info.tot_weight << " frames.";
       }
diff --git a/src/nnet3/nnet-discriminative-diagnostics.cc b/src/nnet3/nnet-discriminative-diagnostics.cc
index f23af549d72..488372be8e1 100644
--- a/src/nnet3/nnet-discriminative-diagnostics.cc
+++ b/src/nnet3/nnet-discriminative-diagnostics.cc
@@ -78,7 +78,7 @@ void NnetDiscriminativeComputeObjf::Compute(const NnetDiscriminativeExample &eg)
                                       store_component_stats,
                                       use_xent_regularization, use_xent_derivative,
                                       &request);
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
   NnetComputer computer(nnet_config_.compute_config, *computation,
                         nnet_, deriv_nnet_);
   // give the inputs to the computer object.
@@ -206,4 +206,3 @@ const discriminative::DiscriminativeObjectiveInfo* NnetDiscriminativeComputeObjf
 
 } // namespace nnet3
 } // namespace kaldi
-
diff --git a/src/nnet3/nnet-discriminative-training.cc b/src/nnet3/nnet-discriminative-training.cc
index 0a436b69f8c..91a72c73cca 100644
--- a/src/nnet3/nnet-discriminative-training.cc
+++ b/src/nnet3/nnet-discriminative-training.cc
@@ -70,7 +70,7 @@ void NnetDiscriminativeTrainer::Train(const NnetDiscriminativeExample &eg) {
                                       use_xent_regularization,
                                       need_model_derivative,
                                       &request);
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
 
   NnetComputer computer(nnet_config.compute_config, *computation,
                         *nnet_,
diff --git a/src/nnet3/nnet-optimize-utils.cc b/src/nnet3/nnet-optimize-utils.cc
index 756ea45e894..e587c7ff947 100644
--- a/src/nnet3/nnet-optimize-utils.cc
+++ b/src/nnet3/nnet-optimize-utils.cc
@@ -4950,5 +4950,126 @@ void OptimizeMemoryCompression(const Nnet &nnet,
 }
 
 
+std::shared_ptr<const NnetComputation> ComputationCache::Find(
+    const ComputationRequest &in_request) {
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  CacheType::iterator iter = computation_cache_.find(&in_request);
+  if (iter == computation_cache_.end()) {
+    return NULL;
+  } else {
+    std::shared_ptr<const NnetComputation> ans = iter->second.first;
+    // Update access record by moving the accessed request to the end of the
+    // access queue, which declares that it's the most recently used.
+    access_queue_.splice(access_queue_.end(), access_queue_,
+                         iter->second.second);
+    return ans;
+  }
+}
+
+
+ComputationCache::ComputationCache(int32 cache_capacity):
+    cache_capacity_(cache_capacity) {
+  KALDI_ASSERT(cache_capacity > 0);
+}
+
+std::shared_ptr<const NnetComputation> ComputationCache::Insert(
+    const ComputationRequest &request_in,
+    const NnetComputation *computation_in) {
+
+  std::lock_guard<std::mutex> lock(mutex_);
+  if (static_cast<int32>(computation_cache_.size()) >= cache_capacity_) {
+    //  Cache has reached capacity; purge the least-recently-accessed request
+    const CacheType::iterator iter =
+        computation_cache_.find(access_queue_.front());
+    KALDI_ASSERT(iter != computation_cache_.end());
+    const ComputationRequest *request = iter->first;
+    computation_cache_.erase(iter);
+    delete request;
+    // we don't need to delete the computation in iter->second.first, as the
+    // shared_ptr takes care of that automatically.
+    access_queue_.pop_front();
+  }
+
+  // Now insert the thing we need to insert.  We'll own the pointer 'request' in
+  // 'computation_cache_', so we need to allocate our own version.
+  ComputationRequest *request = new ComputationRequest(request_in);
+  // When we construct this shared_ptr, it takes ownership of the pointer
+  // 'computation_in'.
+  std::shared_ptr<const NnetComputation> computation(computation_in);
+
+  AqType::iterator ait = access_queue_.insert(access_queue_.end(), request);
+
+  std::pair<CacheType::iterator, bool> p = computation_cache_.insert(
+      std::make_pair(request, std::make_pair(computation, ait)));
+  if (!p.second) {
+    // if p.second is false, this pair was not inserted because
+    // a computation for the same computation-request already existed in
+    // the map. This is possible in multi-threaded operations, if two
+    // threads try to compile the same computation at the same time (only
+    // one of them will successfully add it).
+    // We need to erase the access-queue element that we just added, it's
+    // no longer going to be needed.
+    access_queue_.erase(ait);
+    delete request;
+  }
+  return computation;
+}
+
+
+void ComputationCache::Read(std::istream &is, bool binary) {
+  // Note: the object on disk doesn't have tokens like "<ComputationCache>"
+  // and "</ComputationCache>" for back-compatibility reasons.
+  int32 computation_cache_size;
+  ExpectToken(is, binary, "<ComputationCacheSize>");
+  ReadBasicType(is, binary, &computation_cache_size);
+  KALDI_ASSERT(computation_cache_size >= 0);
+  computation_cache_.clear();
+  access_queue_.clear();
+  ExpectToken(is, binary, "<ComputationCache>");
+  for (size_t c = 0; c < computation_cache_size; c++) {
+    ComputationRequest request;
+    request.Read(is, binary);
+    NnetComputation *computation = new NnetComputation();
+    computation->Read(is, binary);
+    Insert(request, computation);
+  }
+}
+
+void ComputationCache::Check(const Nnet &nnet) const {
+  CacheType::const_iterator iter = computation_cache_.begin(),
+      end = computation_cache_.end();
+  // We only need to explicitly delete the pointer to the ComputationRequest.
+  // The pointers to Computation are deleted automatically by std::shared_ptr
+  // when the reference count goes to zero.
+  for (; iter != end; ++iter) {
+    const NnetComputation &computation = *(iter->second.first);
+    CheckComputationOptions check_config;
+    ComputationChecker checker(check_config, nnet, computation);
+    checker.Check();
+  }
+}
+
+void ComputationCache::Write(std::ostream &os, bool binary) const {
+  WriteToken(os, binary, "<ComputationCacheSize>");
+  WriteBasicType(os, binary, static_cast<int32>(computation_cache_.size()));
+  WriteToken(os, binary, "<ComputationCache>");
+  for (CacheType::const_iterator iter = computation_cache_.begin();
+           iter != computation_cache_.end(); ++iter) {
+    iter->first->Write(os, binary);
+    iter->second.first->Write(os, binary);
+  }
+}
+
+ComputationCache::~ComputationCache() {
+  CacheType::const_iterator iter = computation_cache_.begin(),
+      end = computation_cache_.end();
+  // We only need to explicitly delete the pointer to the ComputationRequest.
+  // The pointers to Computation are deleted automatically by std::shared_ptr
+  // when the reference count goes to zero.
+  for (; iter != end; ++iter)
+    delete iter->first;
+}
+
 } // namespace nnet3
 } // namespace kaldi
diff --git a/src/nnet3/nnet-optimize-utils.h b/src/nnet3/nnet-optimize-utils.h
index 32adf9e3e19..0a30dcc84cb 100644
--- a/src/nnet3/nnet-optimize-utils.h
+++ b/src/nnet3/nnet-optimize-utils.h
@@ -20,9 +20,12 @@
 #ifndef KALDI_NNET3_NNET_OPTIMIZE_UTILS_H_
 #define KALDI_NNET3_NNET_OPTIMIZE_UTILS_H_
 
+#include <mutex>
+#include <list>
 #include "nnet3/nnet-compile.h"
 #include "nnet3/nnet-analyze.h"
 
+
 namespace kaldi {
 namespace nnet3 {
 
@@ -613,16 +616,67 @@ void OptimizeLoopedComputation(const Nnet &nnet,
 void FixGotoLabel(NnetComputation *computation);
 
 
-/*
+/// Class ComputationCache is used inside class CachingOptimizingCompiler to
+/// cache previously computed computations.  The code was moved from class
+/// CachingOptimizingCompiler to this separate class for clarity when adding
+/// thread-safety functionality.
+/// It's OK to call Find() and Insert() from multiple threads without
+/// additional synchronization.
+class ComputationCache {
+ public:
+  ComputationCache(int32 cache_capacity);
 
-   Possible TODO:
-      optimizations to replace row-by-row copy and add commands with whole-matrix
-      commands on smaller sub-matrices (if the row-by-row copy commands have certain
-      regularities).  this is a minor issue, we can handle it later.  We have to be
-      careful if this causes sub-matrices to overlap.
+  // Note: if something fails in Read(), or the written cache was from an older
+  // format, it will just leave the cache empty.
+  void Read(std::istream &is, bool binary);
+
+  void Write(std::ostream &os, bool binary) const;
+
+
+  // Searches for the computation corresponding to this computation, and returns
+  // it if cached, or NULL (as std::shared_ptr) if not.  (We need shared_ptr to
+  // handle multi-threaded operation, so that if the computation is ejected from
+  // the cache by another thread, it won't be deleted while still in use).  This
+  // function also moves this computation to the end of the
+  // most-recently-accessed queue, which is why it's not const.
+  std::shared_ptr<const NnetComputation> Find(const ComputationRequest &request);
 
- */
 
+  // Inserts the computation into the cache-- this is assumed to be the
+  // computation for the computation-request 'request'.  Returns a shared_ptr
+  // which can be used to access the object.  This function takes ownership of
+  // 'computation'.
+  std::shared_ptr<const NnetComputation> Insert(const ComputationRequest &request,
+                                                const NnetComputation *computation);
+
+  ~ComputationCache();
+
+  // Checks the stored computation for correctness.
+  void Check(const Nnet &nnet) const;
+ private:
+
+  std::mutex mutex_;  // Read/write mutex.
+
+  int32 cache_capacity_;
+
+  // The access queue for keeping track of the freshness of computation.
+  // Most-recently-accessed computation is at the end, and
+  // least-recently-accessed computaiton is at the beginning.  Together with
+  // computation_cache_, this forms a most-recently-used (MRU) cache for
+  // Computations, indexed by ComputationRequest. The pointers are owned in
+  // computation_cache_.
+  typedef std::list<const ComputationRequest*> AqType;
+  AqType access_queue_;
+
+  // Map from computation-request to pair of (computation, and position in
+  // access_queue_). Used for fast lookup of previously compiled computations.
+  // All pointers are owned here.
+  typedef unordered_map<const ComputationRequest*,
+                        std::pair<std::shared_ptr<const NnetComputation>, AqType::iterator>,
+                        ComputationRequestHasher,
+                        ComputationRequestPtrEqual> CacheType;
+  CacheType computation_cache_;
+};
 
 
 
diff --git a/src/nnet3/nnet-optimize.cc b/src/nnet3/nnet-optimize.cc
index ecce196801b..63a7e833c74 100644
--- a/src/nnet3/nnet-optimize.cc
+++ b/src/nnet3/nnet-optimize.cc
@@ -36,21 +36,19 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &propagate_in_place);
   ExpectToken(is, binary, "<BackpropInPlace>");
   ReadBasicType(is, binary, &backprop_in_place);
-  std::string tok;
-  ReadToken(is, binary, &tok);
-  if (tok == "<OptimizeRowOps>") {
+  if (PeekToken(is, binary) == 'O') {
+    ExpectToken(is, binary, "<OptimizeRowOps>");
     ReadBasicType(is, binary, &optimize_row_ops);
-    ReadToken(is, binary, &tok);
-  } else {
-    optimize_row_ops = true;
   }
-  if (tok == "<SplitRowOps>") {
+  if (PeekToken(is, binary) == 'S') {
+    ExpectToken(is, binary, "<SplitRowOps>");
     ReadBasicType(is, binary, &split_row_ops);
-    ReadToken(is, binary, &tok);
-  } else {
-    split_row_ops = true;
   }
-  KALDI_ASSERT(tok == "<ConvertAddition>");
+  if (PeekToken(is, binary) == 'E') {
+    ExpectToken(is, binary, "<ExtendMatrices>");
+    ReadBasicType(is, binary, &extend_matrices);
+  }
+  ExpectToken(is, binary, "<ConvertAddition>");
   ReadBasicType(is, binary, &convert_addition);
   ExpectToken(is, binary, "<RemoveAssignments>");
   ReadBasicType(is, binary, &remove_assignments);
@@ -68,14 +66,19 @@ void NnetOptimizeOptions::Read(std::istream &is, bool binary) {
   ReadBasicType(is, binary, &min_deriv_time);
   ExpectToken(is, binary, "<MaxDerivTime>");
   ReadBasicType(is, binary, &max_deriv_time);
-  ReadToken(is, binary, &tok);
-  if (tok == "<MaxDerivTimeRelative>") {
+  if (PeekToken(is, binary) == 'M') {
+    ExpectToken(is, binary, "<MaxDerivTimeRelative>");
     ReadBasicType(is, binary, &max_deriv_time_relative);
-    ReadToken(is, binary, &tok);
   }
-
-
-  KALDI_ASSERT(tok == "</NnetOptimizeOptions>");
+  if (PeekToken(is, binary) == 'S') {
+    ExpectToken(is, binary, "<SnipRowOps>");
+    ReadBasicType(is, binary, &snip_row_ops);
+  }
+  if (PeekToken(is, binary) == 'M') {
+    ExpectToken(is, binary, "<MemoryCompressionLevel>");
+    ReadBasicType(is, binary, &memory_compression_level);
+  }
+  ExpectToken(is, binary, "</NnetOptimizeOptions>");
 }
 
 void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const {
@@ -90,6 +93,10 @@ void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, backprop_in_place);
   WriteToken(os, binary, "<OptimizeRowOps>");
   WriteBasicType(os, binary, optimize_row_ops);
+  WriteToken(os, binary, "<SplitRowOps>");
+  WriteBasicType(os, binary, split_row_ops);
+  WriteToken(os, binary, "<ExtendMatrices>");
+  WriteBasicType(os, binary, extend_matrices);
   WriteToken(os, binary, "<ConvertAddition>");
   WriteBasicType(os, binary, convert_addition);
   WriteToken(os, binary, "<RemoveAssignments>");
@@ -110,14 +117,20 @@ void NnetOptimizeOptions::Write(std::ostream &os, bool binary) const {
   WriteBasicType(os, binary, max_deriv_time);
   WriteToken(os, binary, "<MaxDerivTimeRelative>");
   WriteBasicType(os, binary, max_deriv_time_relative);
+  WriteToken(os, binary, "<SnipRowOps>");
+  WriteBasicType(os, binary, snip_row_ops);
+  WriteToken(os, binary, "<MemoryCompressionLevel>");
+  WriteBasicType(os, binary, memory_compression_level);
   WriteToken(os, binary, "</NnetOptimizeOptions>");
 }
 
 bool NnetOptimizeOptions::operator == (const NnetOptimizeOptions &other) const {
-  return (other.propagate_in_place == propagate_in_place &&
-          other.optimize == optimize &&
+  return (other.optimize == optimize &&
           other.consolidate_model_update == consolidate_model_update &&
+          other.propagate_in_place == propagate_in_place &&
           other.backprop_in_place == backprop_in_place &&
+          other.optimize_row_ops == optimize_row_ops &&
+          other.split_row_ops == split_row_ops &&
           other.convert_addition == convert_addition &&
           other.remove_assignments == remove_assignments &&
           other.allow_left_merge == allow_left_merge &&
@@ -127,7 +140,9 @@ bool NnetOptimizeOptions::operator == (const NnetOptimizeOptions &other) const {
           other.allocate_from_other == allocate_from_other &&
           other.min_deriv_time == min_deriv_time &&
           other.max_deriv_time == max_deriv_time &&
-          other.max_deriv_time_relative == max_deriv_time_relative);
+          other.max_deriv_time_relative == max_deriv_time_relative &&
+          other.snip_row_ops == snip_row_ops &&
+          other.memory_compression_level == memory_compression_level);
 }
 
 // move commands that resize and zero matrices to as late/early as possible.
@@ -613,25 +628,6 @@ void Optimize(const NnetOptimizeOptions &config,
     KALDI_LOG << "After optimization, max memory use (bytes) = "
               << GetMaxMemoryUse(*computation);
   }
-
-}
-
-// ComputationRequests are distinguished by the names and indexes
-// of inputs and outputs
-size_t ComputationRequestHasher::operator() (
-    const ComputationRequest *cr) const noexcept {
-  size_t ans = 0;
-  size_t p1 = 4111, p2 = 26951;
-  IoSpecificationHasher io_hasher;
-  std::vector<IoSpecification>::const_iterator itr = cr->inputs.begin(),
-                                               end = cr->inputs.end();
-  for (; itr != end; ++itr)
-    ans = ans * p1 + io_hasher(*itr);
-  itr = cr->outputs.begin();
-  end = cr->outputs.end();
-  for (; itr != end; ++itr)
-    ans = ans * p2 + io_hasher(*itr);
-  return ans;
 }
 
 
@@ -641,7 +637,8 @@ CachingOptimizingCompiler::CachingOptimizingCompiler(
     nnet_(nnet), config_(config),
     seconds_taken_total_(0.0), seconds_taken_compile_(0.0),
     seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0),
-    seconds_taken_check_(0.0), seconds_taken_indexes_(0.0) { }
+    seconds_taken_check_(0.0), seconds_taken_indexes_(0.0),
+    seconds_taken_io_(0.0), cache_(config.cache_capacity) { }
 
 CachingOptimizingCompiler::CachingOptimizingCompiler(
     const Nnet &nnet,
@@ -650,87 +647,41 @@ CachingOptimizingCompiler::CachingOptimizingCompiler(
     nnet_(nnet), config_(config), opt_config_(opt_config),
     seconds_taken_total_(0.0), seconds_taken_compile_(0.0),
     seconds_taken_optimize_(0.0), seconds_taken_expand_(0.0),
-    seconds_taken_check_(0.0), seconds_taken_indexes_(0.0) { }
-
-void CachingOptimizingCompiler::UpdateCache(const ComputationRequest *request,
-                                            const NnetComputation *computation) {
-  if (computation_cache_.size() == config_.cache_capacity) {
-    // full, locate the least-recently-accessed request
-    const CacheType::iterator it =
-        computation_cache_.find(access_queue_.front());
-    KALDI_ASSERT(it != computation_cache_.end());
-    // purge the least-recently-accessed request
-    const ComputationRequest *r = it->first;
-    const NnetComputation *c = it->second.first;
-    computation_cache_.erase(it);
-    delete r;
-    delete c;
-    access_queue_.pop_front();
-  }
-  AqType::iterator ait = access_queue_.insert(access_queue_.end(), request);
-  computation_cache_.insert(std::make_pair(request,
-                            std::make_pair(computation, ait)));
-}
+    seconds_taken_check_(0.0), seconds_taken_indexes_(0.0),
+    seconds_taken_io_(0.0), cache_(config.cache_capacity) { }
+
 
 void CachingOptimizingCompiler::ReadCache(std::istream &is, bool binary) {
-  NnetOptimizeOptions opt_config_cached;
-  opt_config_cached.Read(is, binary);
-  // we won't read cached computations if any optimize option has been changed.
-  bool read_cache = (opt_config_ == opt_config_cached);
-
-  if (read_cache) {
-    int32 computation_cache_size;
-    ExpectToken(is, binary, "<ComputationCacheSize>");
-    ReadBasicType(is, binary, &computation_cache_size);
-    KALDI_ASSERT(computation_cache_size >= 0);
-    computation_cache_.clear();
-    access_queue_.clear();
-    ExpectToken(is, binary, "<ComputationCache>");
-    for (size_t c = 0; c < computation_cache_size; c++) {
-      ComputationRequest *request = new ComputationRequest();
-      request->Read(is, binary);
-      NnetComputation *computation = new NnetComputation();
-      computation->Read(is, binary);
-      if (GetVerboseLevel() >= 3) {
-        Timer timer;
-        CheckComputationOptions check_config;
-        ComputationChecker checker(check_config, nnet_, *computation);
-        checker.Check();
-        seconds_taken_check_ += timer.Elapsed();
-      }
-      UpdateCache(request, computation);
-    }
+  {
+    Timer timer;
+    NnetOptimizeOptions opt_config_cached;
+    opt_config_cached.Read(is, binary);
+    // we won't read cached computations if any optimize option has been changed.
+    if (!(opt_config_ == opt_config_cached))
+      return;
+    cache_.Read(is, binary);
+    seconds_taken_io_ += timer.Elapsed();
+  }
+  if (GetVerboseLevel() >= 2) {
+    Timer timer;
+    cache_.Check(nnet_);
+    seconds_taken_check_ += timer.Elapsed();
+    // we consider the check time part of the total time...  this is very
+    // arbitrary but it only affects printed times-taken.
+    seconds_taken_total_ += timer.Elapsed();
   }
-}
 
-void CachingOptimizingCompiler::WriteCache(std::ostream &os, bool binary) const {
-  opt_config_.Write(os, binary);
-  WriteToken(os, binary, "<ComputationCacheSize>");
-  WriteBasicType(os, binary, static_cast<int32>(computation_cache_.size()));
-  WriteToken(os, binary, "<ComputationCache>");
-  for (CacheType::const_iterator iter = computation_cache_.begin();
-           iter != computation_cache_.end(); ++iter) {
-    iter->first->Write(os, binary);
-    iter->second.first->Write(os, binary);
-  }
 }
 
-void CachingOptimizingCompiler::UpdateAccessQueue(CacheType::iterator &cit) {
-  // exist, update access record by moving the accessed
-  // request to the end of the access queue
-  KALDI_ASSERT(cit != computation_cache_.end());
-  access_queue_.splice(access_queue_.end(), access_queue_,
-                       cit->second.second);
+void CachingOptimizingCompiler::WriteCache(std::ostream &os, bool binary) {
+  Timer timer;
+  opt_config_.Write(os, binary);
+  cache_.Write(os, binary);
+  seconds_taken_io_ += timer.Elapsed();
 }
 
 CachingOptimizingCompiler::~CachingOptimizingCompiler() {
-  CacheType::const_iterator itr = computation_cache_.begin(),
-      end = computation_cache_.end();
-  for (; itr !=end; ++itr) {
-    delete itr->first;
-    delete itr->second.first;
-  }
-  if (seconds_taken_total_ > 0.0) {
+  if (seconds_taken_total_ > 0.0 || seconds_taken_io_ > 0.0) {
     std::ostringstream os;
     double seconds_taken_misc = seconds_taken_total_ - seconds_taken_compile_
         - seconds_taken_optimize_ - seconds_taken_expand_
@@ -742,52 +693,40 @@ CachingOptimizingCompiler::~CachingOptimizingCompiler() {
        << seconds_taken_expand_ << " shortcut expansion, "
        << seconds_taken_check_ << " checking, "
        << seconds_taken_indexes_ << " computing indexes, "
-       << seconds_taken_misc << " misc.)";
+       << seconds_taken_misc << " misc.) + "
+       << seconds_taken_io_ << " I/O.";
     KALDI_LOG << os.str();
     // note: the leftover amount is misc things like hashing and == comparisons on
     // computation-requests, and calling RequestIsDecomposable().
   }
 }
 
-const NnetComputation* CachingOptimizingCompiler::Compile(
+std::shared_ptr<const NnetComputation> CachingOptimizingCompiler::Compile(
     const ComputationRequest  &in_request) {
   Timer timer;
-  const NnetComputation *ans = CompileInternal(in_request);
+  std::shared_ptr<const NnetComputation>  ans = CompileInternal(in_request);
   seconds_taken_total_ += timer.Elapsed();
   return ans;
 }
 
-const NnetComputation* CachingOptimizingCompiler::CompileInternal(
-    const ComputationRequest  &in_request) {
-  const NnetComputation *ans;
-  // find computation in the cache
-  CacheType::iterator cit = computation_cache_.find(&in_request);
-  if (cit == computation_cache_.end()) {
-    ans = CompileAndCache(in_request);
+std::shared_ptr<const NnetComputation> CachingOptimizingCompiler::CompileInternal(
+    const ComputationRequest  &request) {
+  std::shared_ptr<const NnetComputation> ans = cache_.Find(request);
+  if (ans != NULL) {
+    return ans;
   } else {
-    // if found, update access queue
-    const NnetComputation *computation = cit->second.first;
-    UpdateAccessQueue(cit);
-    ans = computation;
+    const NnetComputation *computation = NULL;
+    if (config_.use_shortcut)
+      computation = CompileViaShortcut(request);
+    if (computation == NULL)
+      computation = CompileNoShortcut(request);
+    KALDI_ASSERT(computation != NULL);
+    return cache_.Insert(request, computation);
   }
-  return ans;
 }
 
-const NnetComputation* CachingOptimizingCompiler::CompileAndCache(
-    const ComputationRequest  &in_request) {
-  // we need to make a copy of ComputationRequest, because it's stored
-  // as the key in the cache, and we need to own the pointer.
-  ComputationRequest *request = new ComputationRequest(in_request);
-
-  const NnetComputation *computation = CompileViaShortcut(*request);
-  if (computation == NULL)
-    computation = CompileNoShortcut(*request);
-  UpdateCache(request, computation);
-  return computation;
-}
 
-
-const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut(
+const NnetComputation *CachingOptimizingCompiler::CompileNoShortcut(
     const ComputationRequest &request) {
 
   Compiler compiler(request, nnet_);
@@ -831,12 +770,12 @@ const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut(
     seconds_taken_optimize_ += timer.Elapsed();
   }
 
-
   if (GetVerboseLevel() >= verbose_cutoff) {
     std::ostringstream os;
     computation->Print(os, nnet_);
     KALDI_LOG << "Optimized computation is: " << os.str();
   }
+
   {  // check the computation again.
     Timer timer;
     CheckComputationOptions check_config;
@@ -844,6 +783,7 @@ const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut(
     checker.Check();
     seconds_taken_check_ += timer.Elapsed();
   }
+
   {
     Timer timer;
     computation->ComputeCudaIndexes();
@@ -853,22 +793,17 @@ const NnetComputation* CachingOptimizingCompiler::CompileNoShortcut(
 }
 
 
-const NnetComputation* CachingOptimizingCompiler::CompileViaShortcut(
+const NnetComputation *CachingOptimizingCompiler::CompileViaShortcut(
     const ComputationRequest &request) {
-  if (!config_.use_shortcut)
-    return NULL;
-
   int32 num_n_values;
   ComputationRequest mini_request;
   if (!RequestIsDecomposable(request, &mini_request, &num_n_values))
     return NULL;
 
   // By invoking CompileInternal() on the mini request, we go through the same
-  // caching process as for any externally requested computation.  [the only
-  // difference from Compile() is that it doesn't call the timer code; this
-  // avoids double-counting the time taken.]  This pointer will not have to be
-  // deleted by this function; it's owned by the class, in the cache.
-  const NnetComputation *mini_computation = CompileInternal(mini_request);
+  // caching process as for any externally requested computation.
+  std::shared_ptr<const NnetComputation> mini_computation =
+      CompileInternal(mini_request);
 
   // note: by default we always create debug_info, even in regular compilation.
   // (e.g. it defaults to true in CompilerOptions).  If it really seems to be a
diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h
index a07c5490c5c..aaa1182e1b6 100644
--- a/src/nnet3/nnet-optimize.h
+++ b/src/nnet3/nnet-optimize.h
@@ -23,8 +23,7 @@
 
 #include "nnet3/nnet-compile.h"
 #include "nnet3/nnet-analyze.h"
-
-#include <list>
+#include "nnet3/nnet-optimize-utils.h"
 
 namespace kaldi {
 namespace nnet3 {
@@ -34,6 +33,8 @@ namespace nnet3 {
 // detected, we can work out which optimization was responsible for the error.
 // See the Register() function below for option-specific documentation.
 struct NnetOptimizeOptions {
+  // Caution: if adding or removing members, the Read and Write functions and
+  // the == operator should be modified.  This relates to computation caching.
   bool optimize;  // setting this false disallow all optimization.
   bool consolidate_model_update;
   bool propagate_in_place;
@@ -186,22 +187,6 @@ void Optimize(const NnetOptimizeOptions &config,
               int32 max_output_time_in_request,
               NnetComputation *computation);
 
-// Hash function for ComputationRequest. It converts
-// ComputationRequest to hash code by looking at input
-// and output IoSpecifications vectors.
-struct ComputationRequestHasher {
-  size_t operator()(const ComputationRequest *cr) const noexcept;
-};
-
-// Equality function for ComputationRequest pointer
-struct ComputationRequestPtrEqual {
- public:
-  bool operator() (const ComputationRequest* cr1,
-                   const ComputationRequest* cr2) const {
-    return (*cr1) == (*cr2);
-  }
-};
-
 
 
 struct CachingOptimizingCompilerOptions {
@@ -229,6 +214,8 @@ struct CachingOptimizingCompilerOptions {
 /// This class enables you to do the compilation and optimization in one call,
 /// and also ensures that if the ComputationRequest is identical to the previous
 /// one, the compilation process is not repeated.
+/// It is safe to call Compile() from multiple parallel threads without additional
+/// synchronization; synchronization is managed internally by class ComputationCache.
 class CachingOptimizingCompiler {
  public:
   CachingOptimizingCompiler(const Nnet &nnet,
@@ -242,29 +229,35 @@ class CachingOptimizingCompiler {
                             CachingOptimizingCompilerOptions());
 
   ~CachingOptimizingCompiler();
-  /// Does the compilation and returns a const pointer to
-  /// the result, which is owned by this class, not the caller.
-  /// It calls ComputeCudaIndexes() for you, because you wouldn't
-  /// be able to do this on a const object.
-  const NnetComputation* Compile(const ComputationRequest &request);
+
+  /// Does the compilation and returns a const pointer to the result, which is
+  /// owned by this class, not the caller.  It calls ComputeCudaIndexes() for
+  /// you, because you wouldn't be able to do this on a const object.
+  ///
+  /// Note: this used to return 'const NnetComputation*'.  If you get a
+  /// compilation failure, just replace 'const NnetComputation*' with
+  /// 'std::shared_ptr<const NnetComputation*>' in the calling code.
+  std::shared_ptr<const NnetComputation> Compile(
+      const ComputationRequest &request);
   void ReadCache(std::istream &is, bool binary);
-  void WriteCache(std::ostream &os, bool binary) const;
+  void WriteCache(std::ostream &os, bool binary);
+
  private:
 
   // This function just implements the work of Compile(); it's made a separate
   // function for the convenience of the timer code, to avoid it being called
   // twice (we also call this function directly from inside the class).
-  const NnetComputation* CompileInternal(const ComputationRequest &request);
+  std::shared_ptr<const NnetComputation> CompileInternal(const ComputationRequest &request);
 
   // This function, called from CompileInternal(), is called when a
   // ComputationRequest has been determined not to have already been cached.  It
   // otherwise has the same interface as CompileInternal(), but assumes that
   // there is nothing cached for this computation as yet.  It compiles the
   // computation and takes care of caching it.
-  const NnetComputation* CompileAndCache(const ComputationRequest &request);
+  std::shared_ptr<const NnetComputation> CompileAndCache(const ComputationRequest &request);
 
 
-  // This function, called from CompileAndCache(), tries to compile the
+  // This function, called from CompileInternal(), tries to compile the
   // ComputationRequest 'request' via 'shortcut' compilation; if this is
   // possible, it returns a pointer to a newly allocated computation that it has
   // compiled this way (note: this computation will not yet have been placed in
@@ -273,36 +266,19 @@ class CachingOptimizingCompiler {
   // request was not decomposable because of too few n values or irregular or
   // unexpected structure), this function returns NULL and you should compile
   // via CompileNoShortcut.
-  const NnetComputation* CompileViaShortcut(const ComputationRequest &request);
+  const NnetComputation *CompileViaShortcut(const ComputationRequest &request);
 
-  // This function, called from CompileAndCache(), tries to compile the
+  // This function, called from CompileInternal(), tries to compile the
   // ComputationRequest 'request' via the regular (not shortcut) compilation
   // process; it returns a pointer to a newly allocated computation that it has
   // compiled this way (note: this computation will not yet have been placed in
   // the computation cache).
-  const NnetComputation* CompileNoShortcut(const ComputationRequest &request);
+  const NnetComputation *CompileNoShortcut(const ComputationRequest &request);
 
   const Nnet &nnet_;
   CachingOptimizingCompilerOptions config_;
   NnetOptimizeOptions opt_config_;
 
-  // The access queue for keeping track of the freshness of computation.
-  // Most-recently-accessed computation is at the end, and
-  // least-recently-accessed computaiton is at the beginning.
-  // Together with computation_cache_, this forms a most-recently-used (MRU)
-  // cache for Computations, indexed by ComputationRequest. Pointers
-  // are owned in computation_cache_.
-  typedef std::list<const ComputationRequest*> AqType;
-  AqType access_queue_;
-
-  // Map from computation-request to pair of (computation, and position in
-  // access_queue_). Used for fast lookup of previously compiled computations.
-  // All pointers are owned here.
-  typedef unordered_map<const ComputationRequest*,
-                        std::pair<const NnetComputation*, AqType::iterator>,
-                        ComputationRequestHasher,
-                        ComputationRequestPtrEqual> CacheType;
-  CacheType computation_cache_;
 
   // seconds spent in various phases of compilation-- for diagnostic messages
   double seconds_taken_total_;
@@ -311,15 +287,9 @@ class CachingOptimizingCompiler {
   double seconds_taken_expand_;
   double seconds_taken_check_;
   double seconds_taken_indexes_;
+  double seconds_taken_io_;
 
-  // This function updates the computation cache. It is called within
-  // CompileInternal().  It takes ownership of the pointers.  It inserts the
-  // request at the end of the queue, and purges the least-recently-accessed
-  // request from the queue and the cache if the capacity is reached.
-  void UpdateCache(const ComputationRequest *request,
-                   const NnetComputation *computation);
-  // This function updates the recently accessed queue.
-  void UpdateAccessQueue(CacheType::iterator &cit);
+  ComputationCache cache_;
 };
 
 
diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc
index 812b66c41b1..49222549e4e 100644
--- a/src/nnet3/nnet-training.cc
+++ b/src/nnet3/nnet-training.cc
@@ -62,7 +62,7 @@ void NnetTrainer::Train(const NnetExample &eg) {
   GetComputationRequest(*nnet_, eg, need_model_derivative,
                         config_.store_component_stats,
                         &request);
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
 
   if (config_.backstitch_training_scale > 0.0 &&
       num_minibatches_processed_ % config_.backstitch_training_interval ==
diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc
index e649dc477d5..664b15eb246 100644
--- a/src/nnet3bin/nnet3-xvector-compute.cc
+++ b/src/nnet3bin/nnet3-xvector-compute.cc
@@ -44,7 +44,7 @@ static void RunNnetComputation(const MatrixBase<BaseFloat> &features,
   output_spec.indexes.resize(1);
   request.outputs.resize(1);
   request.outputs[0].Swap(&output_spec);
-  const NnetComputation *computation = compiler->Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler->Compile(request);
   Nnet *nnet_to_update = NULL;  // we're not doing any update.
   NnetComputer computer(NnetComputeOptions(), *computation,
                   nnet, nnet_to_update);
diff --git a/src/rnnlm/rnnlm-core-compute.cc b/src/rnnlm/rnnlm-core-compute.cc
index d7ec22dc538..f0cf4487c2b 100644
--- a/src/rnnlm/rnnlm-core-compute.cc
+++ b/src/rnnlm/rnnlm-core-compute.cc
@@ -44,7 +44,7 @@ BaseFloat RnnlmCoreComputer::Compute(
                              store_component_stats,
                              &request);
 
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
 
   NnetComputeOptions compute_opts;
 
diff --git a/src/rnnlm/rnnlm-core-training.cc b/src/rnnlm/rnnlm-core-training.cc
index ddf4e7b3fb6..63a6dee188d 100644
--- a/src/rnnlm/rnnlm-core-training.cc
+++ b/src/rnnlm/rnnlm-core-training.cc
@@ -156,7 +156,7 @@ void RnnlmCoreTrainer::Train(
                              store_component_stats,
                              &request);
 
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
 
   NnetComputeOptions compute_opts;
 
@@ -178,12 +178,12 @@ void RnnlmCoreTrainer::Train(
     word_embedding_deriv->AddSmatMat(1.0, derived.input_words_smat, kNoTrans,
                                      input_deriv, 1.0);
   }
-  // If relevant, add in the part of the gradient that comes from L2 
+  // If relevant, add in the part of the gradient that comes from L2
   // regularization.
   ApplyL2Regularization(*nnet_,
                         minibatch.num_chunks * config_.l2_regularize_factor,
                         delta_nnet_);
-  
+
   bool success = UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change,
       1.0, 1.0 - config_.momentum, nnet_,
       &num_max_change_per_component_applied_, &num_max_change_global_applied_);
@@ -214,7 +214,7 @@ void RnnlmCoreTrainer::TrainBackstitch(
                              store_component_stats,
                              &request);
 
-  const NnetComputation *computation = compiler_.Compile(request);
+  std::shared_ptr<const NnetComputation> computation = compiler_.Compile(request);
 
   NnetComputeOptions compute_opts;
 
@@ -259,7 +259,7 @@ void RnnlmCoreTrainer::TrainBackstitch(
                           minibatch.num_chunks * config_.l2_regularize_factor,
                           delta_nnet_);
   }
-  
+
   UpdateNnetWithMaxChange(*delta_nnet_, config_.max_param_change,
       max_change_scale, scale_adding, nnet_,
       &num_max_change_per_component_applied_, &num_max_change_global_applied_);
@@ -309,7 +309,7 @@ void RnnlmCoreTrainer::PrintMaxChangeStats() const {
   if (num_max_change_global_applied_ > 0)
     KALDI_LOG << "The global max-change was enforced "
               << (100.0 * num_max_change_global_applied_) /
-                 (num_minibatches_processed_ * 
+                 (num_minibatches_processed_ *
                  (config_.backstitch_training_scale == 0.0 ? 1.0 :
                  1.0 + 1.0 / config_.backstitch_training_interval))
               << "\% of the time.";

From f4a5667df60f14314a6fdc92efd28e733fbcf150 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Thu, 22 Mar 2018 16:17:16 -0400
Subject: [PATCH 3/5] [src] Fix to comment

---
 src/nnet3/nnet-optimize.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/nnet3/nnet-optimize.h b/src/nnet3/nnet-optimize.h
index aaa1182e1b6..78763732469 100644
--- a/src/nnet3/nnet-optimize.h
+++ b/src/nnet3/nnet-optimize.h
@@ -236,7 +236,7 @@ class CachingOptimizingCompiler {
   ///
   /// Note: this used to return 'const NnetComputation*'.  If you get a
   /// compilation failure, just replace 'const NnetComputation*' with
-  /// 'std::shared_ptr<const NnetComputation*>' in the calling code.
+  /// 'std::shared_ptr<const NnetComputation>' in the calling code.
   std::shared_ptr<const NnetComputation> Compile(
       const ComputationRequest &request);
   void ReadCache(std::istream &is, bool binary);

From 9ae3eb78aee0615b28f0a5acec0e6f47635a9765 Mon Sep 17 00:00:00 2001
From: Arseniy Gorin <gorinars@yandex.ru>
Date: Thu, 22 Mar 2018 23:36:58 +0300
Subject: [PATCH 4/5] [src,scripts] Make cache size configurable for xvector
 extraction (#2290)

---
 egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh | 13 ++++++++-----
 src/nnet3bin/nnet3-xvector-compute.cc              |  6 +++++-
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh b/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh
index 5b8a32b5468..2cfda807c8c 100755
--- a/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh
+++ b/egs/sre08/v1/sid/nnet3/xvector/extract_xvectors.sh
@@ -14,9 +14,11 @@
 # Begin configuration section.
 nj=30
 cmd="run.pl"
-chunk_size=-1 # The chunk size over which the embedding is extracted.
-              # If left unspecified, it uses the max_chunk_size in the nnet
-              # directory.
+
+cache_capacity=64 # Cache capacity for x-vector extractor
+chunk_size=-1     # The chunk size over which the embedding is extracted.
+                  # If left unspecified, it uses the max_chunk_size in the nnet
+                  # directory.
 use_gpu=false
 stage=0
 
@@ -34,6 +36,7 @@ if [ $# != 3 ]; then
   echo "  --use-gpu <bool|false>                           # If true, use GPU."
   echo "  --nj <n|30>                                      # Number of jobs"
   echo "  --stage <stage|0>                                # To control partial reruns"
+  echo "  --cache-capacity <n|64>                          # To speed-up xvector extraction"
   echo "  --chunk-size <n|-1>                              # If provided, extracts embeddings with specified"
   echo "                                                   # chunk size, and averages to produce final embedding"
 fi
@@ -77,13 +80,13 @@ if [ $stage -le 0 ]; then
   if $use_gpu; then
     for g in $(seq $nj); do
       $cmd --gpu 1 ${dir}/log/extract.$g.log \
-        nnet3-xvector-compute --use-gpu=yes --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size \
+        nnet3-xvector-compute --use-gpu=yes --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size --cache-capacity=${cache_capacity} \
         "$nnet" "`echo $feat | sed s/JOB/$g/g`" ark,scp:${dir}/xvector.$g.ark,${dir}/xvector.$g.scp || exit 1 &
     done
     wait
   else
     $cmd JOB=1:$nj ${dir}/log/extract.JOB.log \
-      nnet3-xvector-compute --use-gpu=no --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size \
+      nnet3-xvector-compute --use-gpu=no --min-chunk-size=$min_chunk_size --chunk-size=$chunk_size --cache-capacity=${cache_capacity} \
       "$nnet" "$feat" ark,scp:${dir}/xvector.JOB.ark,${dir}/xvector.JOB.scp || exit 1;
   fi
 fi
diff --git a/src/nnet3bin/nnet3-xvector-compute.cc b/src/nnet3bin/nnet3-xvector-compute.cc
index 664b15eb246..33edc8c9fa6 100644
--- a/src/nnet3bin/nnet3-xvector-compute.cc
+++ b/src/nnet3bin/nnet3-xvector-compute.cc
@@ -91,6 +91,8 @@ int main(int argc, char *argv[]) {
     Timer timer;
 
     NnetSimpleComputationOptions opts;
+    CachingOptimizingCompilerOptions compiler_config;
+
     opts.acoustic_scale = 1.0; // by default do no scaling in this recipe.
 
     std::string use_gpu = "no";
@@ -98,6 +100,8 @@ int main(int argc, char *argv[]) {
       min_chunk_size = 100;
 
     opts.Register(&po);
+    compiler_config.Register(&po);
+
     po.Register("use-gpu", &use_gpu,
       "yes|no|optional|wait, only has effect if compiled with CUDA");
     po.Register("chunk-size", &chunk_size,
@@ -127,7 +131,7 @@ int main(int argc, char *argv[]) {
     SetDropoutTestMode(true, &nnet);
     CollapseModel(CollapseModelConfig(), &nnet);
 
-    CachingOptimizingCompiler compiler(nnet, opts.optimize_config);
+    CachingOptimizingCompiler compiler(nnet, opts.optimize_config, compiler_config);
 
     BaseFloatVectorWriter vector_writer(vector_wspecifier);
 

From 6dbe7909dd27204d02bf87a96e3ba6d8dd0f7a6c Mon Sep 17 00:00:00 2001
From: Zhehuai Chen <chenzhehuai@foxmail.com>
Date: Thu, 22 Mar 2018 21:13:02 -0400
Subject: [PATCH 5/5] [src] Speedup and code simplification for chain
 supervision merging (etc.) (#2302)

---
 src/chain/chain-supervision-test.cc      | 40 +++++++----------
 src/chain/chain-supervision.cc           | 56 +++++++++---------------
 src/chain/chain-supervision.h            |  7 ++-
 src/nnet3/discriminative-supervision.cc  | 41 ++++++-----------
 src/nnet3/discriminative-supervision.h   |  3 +-
 src/nnet3/nnet-chain-example.cc          |  9 +---
 src/nnet3/nnet-discriminative-example.cc |  9 +---
 7 files changed, 58 insertions(+), 107 deletions(-)

diff --git a/src/chain/chain-supervision-test.cc b/src/chain/chain-supervision-test.cc
index d14c80cd84f..f93fa3aef7b 100644
--- a/src/chain/chain-supervision-test.cc
+++ b/src/chain/chain-supervision-test.cc
@@ -157,17 +157,11 @@ void TestSupervisionAppend(const TransitionModel &trans_model,
   std::vector<const Supervision*> input(num_append);
   for (int32 i = 0; i < num_append; i++)
     input[i] = &supervision;
-  std::vector<Supervision> output;
-  bool compactify = (RandInt(0, 1) == 0);
-  AppendSupervision(input, compactify, &output);
-  if (compactify) {
-    KALDI_ASSERT(output.size() == 1 &&
-                 output[0].frames_per_sequence ==
-                 supervision.frames_per_sequence &&
-                 output[0].num_sequences == num_append);
-  } else {
-    KALDI_ASSERT(output.size() == input.size());
-  }
+  Supervision output;
+  AppendSupervision(input, &output);
+  KALDI_ASSERT(output.frames_per_sequence ==
+               supervision.frames_per_sequence &&
+               output.num_sequences == num_append);
   int32 tot_sequences_in = 0, tot_sequences_out = 0,
       tot_frames_in = 0, tot_frames_out = 0;
   for (int32 i = 0; i < num_append; i++) {
@@ -175,17 +169,15 @@ void TestSupervisionAppend(const TransitionModel &trans_model,
     tot_frames_in += input[i]->num_sequences *
         input[i]->frames_per_sequence;
   }
-  for (int32 i = 0; i < output.size(); i++) {
-    tot_sequences_out += output[i].num_sequences;
-    tot_frames_out += output[i].num_sequences *
-        output[i].frames_per_sequence;
-  }
+  tot_sequences_out += output.num_sequences;
+  tot_frames_out += output.num_sequences *
+      output.frames_per_sequence;
   KALDI_ASSERT(tot_sequences_out == tot_sequences_in &&
                tot_frames_out == tot_frames_in);
 
-  TestSupervisionIo(output[0]);
-  TestSupervisionNumerator(output[0]);
-  output[0].Check(trans_model);
+  TestSupervisionIo(output);
+  TestSupervisionNumerator(output);
+  output.Check(trans_model);
 }
 
 void TestSupervisionReattached(const TransitionModel &trans_model,
@@ -368,18 +360,16 @@ void TestSupervisionSplitting(const ContextDependency &ctx_dep,
     TestSupervisionIo(split_supervision[RandInt(0, num_ranges - 1)]);
     TestSupervisionFrames(split_supervision[RandInt(0, num_ranges - 1)]);
 
-    std::vector<Supervision> reattached_supervision;
+    Supervision reattached_supervision;
     std::vector<const Supervision*> to_append(num_ranges);
     for (int32 i = 0; i < num_ranges; i++)
       to_append[i] = &(split_supervision[i]);
-    bool compactify = true;
-    AppendSupervision(to_append, compactify, &reattached_supervision);
-    KALDI_ASSERT(reattached_supervision.size() == 1);
-    ChainTrainingTest(den_graph, reattached_supervision[0]);
+    AppendSupervision(to_append, &reattached_supervision);
+    ChainTrainingTest(den_graph, reattached_supervision);
     if (num_frames % frames_per_range == 0) {
       TestSupervisionReattached(trans_model,
                                 supervision,
-                                reattached_supervision[0]);
+                                reattached_supervision);
     }
   }
 }
diff --git a/src/chain/chain-supervision.cc b/src/chain/chain-supervision.cc
index 6b386638771..579e3d7b3e0 100644
--- a/src/chain/chain-supervision.cc
+++ b/src/chain/chain-supervision.cc
@@ -694,71 +694,57 @@ Supervision::Supervision(const Supervision &other):
 // This static function is called by AppendSupervision if the supervisions
 // are end2end. It simply puts all e2e FST's into 1 supervision.
 void AppendSupervisionE2e(const std::vector<const Supervision*> &input,
-                          bool compactify,
-                          std::vector<Supervision> *output_supervision) {
+                          Supervision *output_supervision) {
   KALDI_ASSERT(!input.empty());
   KALDI_ASSERT(input[0]->e2e);
-  output_supervision->clear();
-  output_supervision->resize(1);
   KALDI_ASSERT(input[0]->e2e_fsts.size() == 1);
-  (*output_supervision)[0] = *(input[0]);
+  *output_supervision = *(input[0]);
   for (int32 i = 1; i < input.size(); i++) {
-    (*output_supervision)[0].num_sequences++;
+    output_supervision->num_sequences++;
     KALDI_ASSERT(input[i]->e2e_fsts.size() == 1);
     KALDI_ASSERT(input[i]->frames_per_sequence ==
-                 (*output_supervision)[0].frames_per_sequence);
-    (*output_supervision)[0].e2e_fsts.push_back(input[i]->e2e_fsts[0]);
+                 output_supervision->frames_per_sequence);
+    output_supervision->e2e_fsts.push_back(input[i]->e2e_fsts[0]);
   }
 }
 
 void AppendSupervision(const std::vector<const Supervision*> &input,
-                       bool compactify,
-                       std::vector<Supervision> *output_supervision) {
+                       Supervision *output_supervision) {
   KALDI_ASSERT(!input.empty());
   int32 label_dim = input[0]->label_dim,
       num_inputs = input.size();
   if (num_inputs == 1) {
-    output_supervision->resize(1);
-    (*output_supervision)[0] = *(input[0]);
+    *output_supervision = *(input[0]);
     return;
   }
   if (input[0]->e2e) {
-    AppendSupervisionE2e(input, compactify, output_supervision);
+    AppendSupervisionE2e(input, output_supervision);
     return;
   }
 
-  std::vector<bool> output_was_merged;
   for (int32 i = 1; i < num_inputs; i++)
     KALDI_ASSERT(input[i]->label_dim == label_dim &&
                  "Trying to append incompatible Supervision objects");
-  output_supervision->clear();
-  output_supervision->reserve(input.size());
-  for (int32 i = 0; i < input.size(); i++) {
+  *output_supervision = *(input[num_inputs-1]);
+  for (int32 i = num_inputs - 2; i >= 0; i--) {
     const Supervision &src = *(input[i]);
-    if (compactify && !output_supervision->empty() &&
-        output_supervision->back().weight == src.weight &&
-        output_supervision->back().frames_per_sequence ==
+    if (output_supervision->weight == src.weight &&
+        output_supervision->frames_per_sequence ==
         src.frames_per_sequence) {
       // Combine with current output
       // append src.fst to output_supervision->fst.
-      fst::Concat(&output_supervision->back().fst, src.fst);
-      output_supervision->back().num_sequences++;
-      output_was_merged.back() = true;
+      // the complexity here is O(V1 + E1)
+      fst::Concat(src.fst, &output_supervision->fst);
+      output_supervision->num_sequences++;
     } else {
-      output_supervision->resize(output_supervision->size() + 1);
-      output_supervision->back() = src;
-      output_was_merged.push_back(false);
-    }
-  }
-  KALDI_ASSERT(output_was_merged.size() == output_supervision->size());
-  for (size_t i = 0; i < output_supervision->size(); i++) {
-    if (output_was_merged[i]) {
-      fst::StdVectorFst &out_fst = (*output_supervision)[i].fst;
-      // The process of concatenation will have introduced epsilons.
-      fst::RmEpsilon(&out_fst);
-      SortBreadthFirstSearch(&out_fst);
+      KALDI_ERR << "Mismatch weight or frames_per_sequence  between inputs";
     }
+
   }
+  fst::StdVectorFst &out_fst = output_supervision->fst;
+  // The process of concatenation will have introduced epsilons.
+  fst::RmEpsilon(&out_fst);
+  SortBreadthFirstSearch(&out_fst);
 }
 
 // This static function is called by AddWeightToSupervisionFst if the supervision
diff --git a/src/chain/chain-supervision.h b/src/chain/chain-supervision.h
index 641902e6de0..13866e2aba6 100644
--- a/src/chain/chain-supervision.h
+++ b/src/chain/chain-supervision.h
@@ -380,15 +380,14 @@ int32 ComputeFstStateTimes(const fst::StdVectorFst &fst,
 /// This function appends a list of supervision objects to create what will
 /// usually be a single such object, but if the weights and num-frames are not
 /// all the same it will only append Supervision objects where successive ones
-/// have the same weight and num-frames, and if 'compactify' is true.  The
-/// normal use-case for this is when you are combining neural-net examples for
+/// have the same weight and num-frames.
+/// The normal use-case for this is when you are combining neural-net examples for
 /// training; appending them like this helps to simplify the training process.
 
 /// This function will crash if the values of label_dim in the inputs are not
 /// all the same.
 void AppendSupervision(const std::vector<const Supervision*> &input,
-                       bool compactify,
-                       std::vector<Supervision> *output_supervision);
+                       Supervision *output_supervision);
 
 
 /// This function helps you to pseudo-randomly split a sequence of length 'num_frames',
diff --git a/src/nnet3/discriminative-supervision.cc b/src/nnet3/discriminative-supervision.cc
index 94a165f4c50..716650f16fa 100644
--- a/src/nnet3/discriminative-supervision.cc
+++ b/src/nnet3/discriminative-supervision.cc
@@ -400,49 +400,36 @@ void DiscriminativeSupervisionSplitter::ComputeLatticeScores(const Lattice &lat,
 }
 
 void AppendSupervision(const std::vector<const DiscriminativeSupervision*> &input,
-    bool compactify,
-    std::vector<DiscriminativeSupervision> *output_supervision) {
+    DiscriminativeSupervision *output_supervision) {
   KALDI_ASSERT(!input.empty());
   int32 num_inputs = input.size();
   if (num_inputs == 1) {
-    output_supervision->resize(1);
-    (*output_supervision)[0] = *(input[0]);
+    *output_supervision = *(input[0]);
     return;
   }
-  std::vector<bool> output_was_merged;
-  output_supervision->clear();
-  output_supervision->reserve(input.size());
-  for (int32 i = 0; i < input.size(); i++) {
+  *output_supervision = *(input[num_inputs-1]);
+  for (int32 i = num_inputs - 2; i >= 0; i--) {
     const DiscriminativeSupervision &src = *(input[i]);
     KALDI_ASSERT(src.num_sequences == 1);
-    if (compactify && !output_supervision->empty() &&
-        output_supervision->back().weight == src.weight &&
-        output_supervision->back().frames_per_sequence ==
+    if (output_supervision->weight == src.weight &&
+        output_supervision->frames_per_sequence ==
         src.frames_per_sequence) {
       // Combine with current output
       // append src.den_lat to output_supervision->den_lat.
-      fst::Concat(&output_supervision->back().den_lat, src.den_lat);
+      fst::Concat(src.den_lat, &output_supervision->den_lat);
 
-      output_supervision->back().num_ali.insert(
-          output_supervision->back().num_ali.end(),
+      output_supervision->num_ali.insert(
+          output_supervision->num_ali.begin(),
           src.num_ali.begin(), src.num_ali.end());
 
-      output_supervision->back().num_sequences++;
-      output_was_merged.back() = true;
+      output_supervision->num_sequences++;
     } else {
-      output_supervision->resize(output_supervision->size() + 1);
-      output_supervision->back() = src;
-      output_was_merged.push_back(false);
-    }
-  }
-  KALDI_ASSERT(output_was_merged.size() == output_supervision->size());
-  for (size_t i = 0; i < output_supervision->size(); i++) {
-    if (output_was_merged[i]) {
-      DiscriminativeSupervision &out_sup = (*output_supervision)[i];
-      fst::TopSort(&(out_sup.den_lat));
-      out_sup.Check();
+      KALDI_ERR << "Mismatch weight or frames_per_sequence  between inputs";
     }
   }
+  DiscriminativeSupervision &out_sup = *output_supervision;
+  fst::TopSort(&(out_sup.den_lat));
+  out_sup.Check();
 }
 
 } // namespace discriminative
diff --git a/src/nnet3/discriminative-supervision.h b/src/nnet3/discriminative-supervision.h
index a9d58d120f5..6fd684093e2 100644
--- a/src/nnet3/discriminative-supervision.h
+++ b/src/nnet3/discriminative-supervision.h
@@ -223,8 +223,7 @@ class DiscriminativeSupervisionSplitter {
 /// training; appending them like this helps to simplify the training process.
 
 void AppendSupervision(const std::vector<const DiscriminativeSupervision*> &input,
-    bool compactify,
-    std::vector<DiscriminativeSupervision> *output_supervision);
+    DiscriminativeSupervision *output_supervision);
 
 
 } // namespace discriminative
diff --git a/src/nnet3/nnet-chain-example.cc b/src/nnet3/nnet-chain-example.cc
index 351312fb952..aad5f83bc80 100644
--- a/src/nnet3/nnet-chain-example.cc
+++ b/src/nnet3/nnet-chain-example.cc
@@ -204,15 +204,10 @@ static void MergeSupervision(
   input_supervision.reserve(inputs.size());
   for (int32 n = 0; n < num_inputs; n++)
     input_supervision.push_back(&(inputs[n]->supervision));
-  std::vector<chain::Supervision> output_supervision;
-  bool compactify = true;
+  chain::Supervision output_supervision;
   AppendSupervision(input_supervision,
-                    compactify,
                     &output_supervision);
-  if (output_supervision.size() != 1)
-    KALDI_ERR << "Failed to merge 'chain' examples-- inconsistent lengths "
-              << "or weights?";
-  output->supervision.Swap(&(output_supervision[0]));
+  output->supervision.Swap(&output_supervision);
 
   output->indexes.clear();
   output->indexes.reserve(num_indexes);
diff --git a/src/nnet3/nnet-discriminative-example.cc b/src/nnet3/nnet-discriminative-example.cc
index 61a9669fb76..29eb65c30b1 100644
--- a/src/nnet3/nnet-discriminative-example.cc
+++ b/src/nnet3/nnet-discriminative-example.cc
@@ -196,15 +196,10 @@ void MergeSupervision(
   input_supervision.reserve(inputs.size());
   for (int32 n = 0; n < num_inputs; n++)
     input_supervision.push_back(&(inputs[n]->supervision));
-  std::vector<discriminative::DiscriminativeSupervision> output_supervision;
-  bool compactify = true;
+  discriminative::DiscriminativeSupervision output_supervision;
   discriminative::AppendSupervision(input_supervision,
-                         compactify,
                          &output_supervision);
-  if (output_supervision.size() != 1)
-    KALDI_ERR << "Failed to merge discriminative examples-- inconsistent lengths "
-              << "or weights?";
-  output->supervision.Swap(&(output_supervision[0]));
+  output->supervision.Swap(&(output_supervision));
 
   output->indexes.clear();
   output->indexes.reserve(num_indexes);