From d8ff7ee0cd9f5c670c36ad4525899e668d50abef Mon Sep 17 00:00:00 2001 From: chenzhehuai Date: Tue, 3 Apr 2018 00:53:59 -0400 Subject: [PATCH 01/93] make fst templates inline to eliminate linking errors in other places --- src/fstext/fstext-utils-inl.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fstext/fstext-utils-inl.h b/src/fstext/fstext-utils-inl.h index 923c67c07e2..756e449fcaa 100644 --- a/src/fstext/fstext-utils-inl.h +++ b/src/fstext/fstext-utils-inl.h @@ -1132,7 +1132,7 @@ inline bool IsStochasticFst(const Fst &fst, // Will override this for LogArc where NaturalLess will not work. template -bool IsStochasticFst(const Fst &fst, +inline bool IsStochasticFst(const Fst &fst, float delta, typename Arc::Weight *min_sum, typename Arc::Weight *max_sum) { @@ -1168,7 +1168,7 @@ bool IsStochasticFst(const Fst &fst, // Overriding template for LogArc as NaturalLess does not work there. template<> -bool IsStochasticFst(const Fst &fst, +inline bool IsStochasticFst(const Fst &fst, float delta, LogArc::Weight *min_sum, LogArc::Weight *max_sum) { @@ -1208,7 +1208,7 @@ bool IsStochasticFst(const Fst &fst, // This function deals with the generic fst. // This version currently supports ConstFst or VectorFst. // Otherwise, it will be died with an error. -bool IsStochasticFstInLog(const Fst &fst, +inline bool IsStochasticFstInLog(const Fst &fst, float delta, StdArc::Weight *min_sum, StdArc::Weight *max_sum) { From fa53cc6a9866821b8606fa4cbce0e506948fb375 Mon Sep 17 00:00:00 2001 From: chenzhehuai Date: Fri, 6 Apr 2018 09:01:10 -0400 Subject: [PATCH 02/93] tmp --- src/bin/latgen-biglm-faster-mapped.cc | 278 ++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 src/bin/latgen-biglm-faster-mapped.cc diff --git a/src/bin/latgen-biglm-faster-mapped.cc b/src/bin/latgen-biglm-faster-mapped.cc new file mode 100644 index 00000000000..18a3336540b --- /dev/null +++ b/src/bin/latgen-biglm-faster-mapped.cc @@ -0,0 +1,278 @@ +// bin/latgen-biglm-faster-mapped .cc + +// Copyright 2018 Zhehuai Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/decoder-wrappers.h" +#include "decoder/decodable-matrix.h" +#include "base/timer.h" + + +namespace kaldi { +// Takes care of output. Returns true on success. +bool DecodeUtterance(LatticeBiglmFasterDecoder &decoder, // not const but is really an input. + DecodableInterface &decodable, // not const but is really an input. 
+ const TransitionModel &trans_model, + const fst::SymbolTable *word_syms, + std::string utt, + double acoustic_scale, + bool determinize, + bool allow_partial, + Int32VectorWriter *alignment_writer, + Int32VectorWriter *words_writer, + CompactLatticeWriter *compact_lattice_writer, + LatticeWriter *lattice_writer, + double *like_ptr) { // puts utterance's like in like_ptr on success. + using fst::VectorFst; + + if (!decoder.Decode(&decodable)) { + KALDI_WARN << "Failed to decode file " << utt; + return false; + } + if (!decoder.ReachedFinal()) { + if (allow_partial) { + KALDI_WARN << "Outputting partial output for utterance " << utt + << " since no final-state reached\n"; + } else { + KALDI_WARN << "Not producing output for utterance " << utt + << " since no final-state reached and " + << "--allow-partial=false.\n"; + return false; + } + } + + double likelihood; + LatticeWeight weight; + int32 num_frames; + { // First do some stuff with word-level traceback... + VectorFst decoded; + decoder.GetBestPath(&decoded); + if (decoded.NumStates() == 0) + // Shouldn't really reach this point as already checked success. + KALDI_ERR << "Failed to get traceback for utterance " << utt; + + std::vector alignment; + std::vector words; + GetLinearSymbolSequence(decoded, &alignment, &words, &weight); + num_frames = alignment.size(); + if (words_writer->IsOpen()) + words_writer->Write(utt, words); + if (alignment_writer->IsOpen()) + alignment_writer->Write(utt, alignment); + if (word_syms != NULL) { + std::cerr << utt << ' '; + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms->Find(words[i]); + if (s == "") + KALDI_ERR << "Word-id " << words[i] <<" not in symbol table."; + std::cerr << s << ' '; + } + std::cerr << '\n'; + } + likelihood = -(weight.Value1() + weight.Value2()); + } + + // Get lattice, and do determinization if requested. + Lattice lat; + decoder.GetRawLattice(&lat); + if (lat.NumStates() == 0) + KALDI_ERR << "Unexpected problem getting lattice for utterance " << utt; + fst::Connect(&lat); + if (determinize) { + CompactLattice clat; + if (!DeterminizeLatticePhonePrunedWrapper( + trans_model, + &lat, + decoder.GetOptions().lattice_beam, + &clat, + decoder.GetOptions().det_opts)) + KALDI_WARN << "Determinization finished earlier than the beam for " + << "utterance " << utt; + // We'll write the lattice without acoustic scaling. + if (acoustic_scale != 0.0) + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &clat); + compact_lattice_writer->Write(utt, clat); + } else { + Lattice fst; + decoder.GetRawLattice(&fst); + if (fst.NumStates() == 0) + KALDI_ERR << "Unexpected problem getting lattice for utterance " + << utt; + fst::Connect(&fst); // Will get rid of this later... shouldn't have any + // disconnected states there, but we seem to. 
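+      // Scaling by 1.0/acoustic_scale below undoes the acoustic scaling that
+      // was applied during decoding, so the written lattice carries raw
+      // acoustic costs and a downstream stage (e.g. lattice rescoring) can
+      // apply its own scale.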
+ if (acoustic_scale != 0.0) // We'll write the lattice without acoustic scaling + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &fst); + lattice_writer->Write(utt, fst); + } + KALDI_LOG << "Log-like per frame for utterance " << utt << " is " + << (likelihood / num_frames) << " over " + << num_frames << " frames."; + KALDI_VLOG(2) << "Cost for utterance " << utt << " is " + << weight.Value1() << " + " << weight.Value2(); + *like_ptr = likelihood; + return true; +} + +} + + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::VectorFst; + using fst::Fst; + using fst::StdArc; + using fst::ReadFstKaldi; + + const char *usage = + "Generate lattices using on-the-fly composition.\n" + "User supplies LM used to generate decoding graph, and desired LM;\n" + "this decoder applies the difference during decoding\n" + "Usage: latgen-biglm-faster-mapped [options] model-in (fst-in|fsts-rspecifier) " + "oldlm-fst-in newlm-fst-in features-rspecifier" + " lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]\n"; + ParseOptions po(usage); + Timer timer; + bool allow_partial = false; + BaseFloat acoustic_scale = 0.1; + LatticeBiglmFasterDecoderConfig config; + + std::string word_syms_filename; + config.Register(&po); + po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); + + po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]"); + po.Register("allow-partial", &allow_partial, "If true, produce output even if end state was not reached."); + + po.Read(argc, argv); + + if (po.NumArgs() < 6 || po.NumArgs() > 8) { + po.PrintUsage(); + exit(1); + } + + std::string model_in_filename = po.GetArg(1), + fst_in_str = po.GetArg(2), + old_lm_fst_rxfilename = po.GetArg(3), + new_lm_fst_rxfilename = po.GetArg(4), + feature_rspecifier = po.GetArg(5), + lattice_wspecifier = po.GetArg(6), + words_wspecifier = po.GetOptArg(7), + alignment_wspecifier = po.GetOptArg(8); + + TransitionModel trans_model; + ReadKaldiObject(model_in_filename, &trans_model); + + VectorFst *old_lm_fst = fst::CastOrConvertToVectorFst( + fst::ReadFstKaldiGeneric(old_lm_fst_rxfilename)); + ApplyProbabilityScale(-1.0, old_lm_fst); // Negate old LM probs... + + VectorFst *new_lm_fst = fst::CastOrConvertToVectorFst( + fst::ReadFstKaldiGeneric(new_lm_fst_rxfilename)); + + fst::BackoffDeterministicOnDemandFst old_lm_dfst(*old_lm_fst); + fst::BackoffDeterministicOnDemandFst new_lm_dfst(*new_lm_fst); + fst::ComposeDeterministicOnDemandFst compose_dfst(&old_lm_dfst, + &new_lm_dfst); + fst::CacheDeterministicOnDemandFst cache_dfst(&compose_dfst); + + bool determinize = config.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? 
compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + + Int32VectorWriter words_writer(words_wspecifier); + + Int32VectorWriter alignment_writer(alignment_wspecifier); + + fst::SymbolTable *word_syms = NULL; + if (word_syms_filename != "") + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_success = 0, num_fail = 0; + + + if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + // Input FST is just one FST, not a table of FSTs. + Fst *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str); + + { + LatticeBiglmFasterDecoder decoder(*decode_fst, config, &cache_dfst); + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + Matrix features (feature_reader.Value()); + feature_reader.FreeCurrent(); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + + DecodableMatrixScaledMapped decodable(trans_model, loglikes, acoustic_scale); + + double like; + if (DecodeUtterance(decoder, decodable, trans_model, word_syms, + utt, acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, + &compact_lattice_writer, &lattice_writer, + &like)) { + tot_like += like; + frame_count += features.NumRows(); + num_success++; + } else num_fail++; + } + } + delete decode_fst; // delete this only after decoder goes out of scope. + } else { // We have different FSTs for different utterances. 
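+      // The table-of-FSTs mode (an rspecifier of per-utterance decoding
+      // graphs, as supported by latgen-faster-mapped) is not implemented in
+      // this tool yet; the assert below makes that explicit.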
+ assert(0); + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " + << frame_count<<" frames."; + + delete word_syms; + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} From efccda41d514de1452b59bf1f890007b88d7876e Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Fri, 6 Apr 2018 12:26:29 -0400 Subject: [PATCH 03/93] zchen@c05:/export/a12/zchen/works/decoder/egs/mini_librispeech/s5_otf$ bash run.biglm.sh --- src/bin/Makefile | 1 + src/bin/latgen-biglm-faster-mapped.cc | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bin/Makefile b/src/bin/Makefile index 627c4f8a131..165eac6bb26 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -23,6 +23,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \ vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \ transform-vec align-text matrix-dim post-to-smat +BINFILES += latgen-biglm-faster-mapped OBJFILES = diff --git a/src/bin/latgen-biglm-faster-mapped.cc b/src/bin/latgen-biglm-faster-mapped.cc index 18a3336540b..10265551a89 100644 --- a/src/bin/latgen-biglm-faster-mapped.cc +++ b/src/bin/latgen-biglm-faster-mapped.cc @@ -26,6 +26,7 @@ #include "decoder/decoder-wrappers.h" #include "decoder/decodable-matrix.h" #include "base/timer.h" +#include "decoder/lattice-biglm-faster-decoder.h" namespace kaldi { @@ -240,7 +241,7 @@ int main(int argc, char *argv[]) { continue; } - DecodableMatrixScaledMapped decodable(trans_model, loglikes, acoustic_scale); + DecodableMatrixScaledMapped decodable(trans_model, features, acoustic_scale); double like; if (DecodeUtterance(decoder, decodable, trans_model, word_syms, From ca4fb58c1dd3362b241ceb53f3634e9ce07f7c6f Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Fri, 6 Apr 2018 14:18:51 -0700 Subject: [PATCH 04/93] log --- src/decoder/lattice-biglm-faster-decoder.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/decoder/lattice-biglm-faster-decoder.h b/src/decoder/lattice-biglm-faster-decoder.h index 6276c25a83d..ff337f81083 100644 --- a/src/decoder/lattice-biglm-faster-decoder.h +++ b/src/decoder/lattice-biglm-faster-decoder.h @@ -640,6 +640,8 @@ class LatticeBiglmFasterDecoder { } } if (tok_count != NULL) *tok_count = count; + KALDI_VLOG(6) << "Number of tokens active on frame " << active_toks_.size() - 1 + << " is " << tmp_array_.size(); if (tmp_array_.size() <= static_cast(config_.max_active)) { if (adaptive_beam) *adaptive_beam = config_.beam; return best_weight + config_.beam; From a5da4f9f42783553c5c18e1e1d467ba3efd5f8eb Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Fri, 6 Apr 2018 16:30:29 -0700 Subject: [PATCH 05/93] prune --- src/decoder/lattice-biglm-faster-decoder.h | 69 ++++++++++++++-------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/src/decoder/lattice-biglm-faster-decoder.h b/src/decoder/lattice-biglm-faster-decoder.h index ff337f81083..39c0658a830 100644 --- a/src/decoder/lattice-biglm-faster-decoder.h +++ b/src/decoder/lattice-biglm-faster-decoder.h @@ -630,36 +630,55 @@ class LatticeBiglmFasterDecoder { if (adaptive_beam != NULL) *adaptive_beam = config_.beam; return best_weight + config_.beam; } else { - tmp_array_.clear(); - for (Elem *e = list_head; e 
!= NULL; e = e->tail, count++) { - BaseFloat w = e->val->tot_cost; - tmp_array_.push_back(w); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } + tmp_array_.clear(); + for (Elem *e = list_head; e != NULL; e = e->tail, count++) { + BaseFloat w = e->val->tot_cost; + tmp_array_.push_back(w); + if (w < best_weight) { + best_weight = w; + if (best_elem) *best_elem = e; } - if (tok_count != NULL) *tok_count = count; - KALDI_VLOG(6) << "Number of tokens active on frame " << active_toks_.size() - 1 - << " is " << tmp_array_.size(); - if (tmp_array_.size() <= static_cast(config_.max_active)) { - if (adaptive_beam) *adaptive_beam = config_.beam; - return best_weight + config_.beam; - } else { - // the lowest elements (lowest costs, highest likes) - // will be put in the left part of tmp_array. + } + if (tok_count != NULL) *tok_count = count; + + BaseFloat beam_cutoff = best_weight + config_.beam, + min_active_cutoff = std::numeric_limits::infinity(), + max_active_cutoff = std::numeric_limits::infinity(); + + KALDI_VLOG(6) << "Number of tokens active on frame " << active_toks_.size() + << " is " << tmp_array_.size(); + + if (tmp_array_.size() > static_cast(config_.max_active)) { + std::nth_element(tmp_array_.begin(), + tmp_array_.begin() + config_.max_active, + tmp_array_.end()); + max_active_cutoff = tmp_array_[config_.max_active]; + } + if (max_active_cutoff < beam_cutoff) { // max_active is tighter than beam. + if (adaptive_beam) + *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta; + return max_active_cutoff; + } + if (tmp_array_.size() > static_cast(config_.min_active)) { + if (config_.min_active == 0) min_active_cutoff = best_weight; + else { std::nth_element(tmp_array_.begin(), - tmp_array_.begin()+config_.max_active, + tmp_array_.begin() + config_.min_active, + tmp_array_.size() > static_cast(config_.max_active) ? + tmp_array_.begin() + config_.max_active : tmp_array_.end()); - // return the tighter of the two beams. - BaseFloat ans = std::min(best_weight + config_.beam, - *(tmp_array_.begin()+config_.max_active)); - if (adaptive_beam) - *adaptive_beam = std::min(config_.beam, - ans - best_weight + config_.beam_delta); - return ans; + min_active_cutoff = tmp_array_[config_.min_active]; } } + if (min_active_cutoff > beam_cutoff) { // min_active is looser than beam. 
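+        // min_active puts a floor on the number of surviving tokens: when
+        // fewer than min_active tokens fall inside the beam, the beam is
+        // widened to the cost of the min_active-th best token, mirroring how
+        // max_active above tightens it.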
+ if (adaptive_beam) + *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta; + return min_active_cutoff; + } else { + *adaptive_beam = config_.beam; + return beam_cutoff; + } + } } inline StateId PropagateLm(StateId lm_state, From dd1a532e96a31f44fa22ee15df56c55c6bcd51b2 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sat, 7 Apr 2018 05:00:55 -0700 Subject: [PATCH 06/93] add profile --- src/bin/latgen-biglm-faster-mapped.cc | 1 + src/decoder/lattice-biglm-faster-decoder.h | 36 ++++++++++++---------- src/decoder/lattice-faster-decoder.cc | 4 +-- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/bin/latgen-biglm-faster-mapped.cc b/src/bin/latgen-biglm-faster-mapped.cc index 10265551a89..e8bc461afe4 100644 --- a/src/bin/latgen-biglm-faster-mapped.cc +++ b/src/bin/latgen-biglm-faster-mapped.cc @@ -230,6 +230,7 @@ int main(int argc, char *argv[]) { { LatticeBiglmFasterDecoder decoder(*decode_fst, config, &cache_dfst); + timer.Reset(); for (; !feature_reader.Done(); feature_reader.Next()) { std::string utt = feature_reader.Key(); diff --git a/src/decoder/lattice-biglm-faster-decoder.h b/src/decoder/lattice-biglm-faster-decoder.h index 39c0658a830..b13236c2970 100644 --- a/src/decoder/lattice-biglm-faster-decoder.h +++ b/src/decoder/lattice-biglm-faster-decoder.h @@ -615,21 +615,22 @@ class LatticeBiglmFasterDecoder { /// Gets the weight cutoff. Also counts the active tokens. BaseFloat GetCutoff(Elem *list_head, size_t *tok_count, BaseFloat *adaptive_beam, Elem **best_elem) { - BaseFloat best_weight = std::numeric_limits::infinity(); - // positive == high cost == bad. - size_t count = 0; - if (config_.max_active == std::numeric_limits::max()) { - for (Elem *e = list_head; e != NULL; e = e->tail, count++) { - BaseFloat w = static_cast(e->val->tot_cost); - if (w < best_weight) { - best_weight = w; - if (best_elem) *best_elem = e; - } + BaseFloat best_weight = std::numeric_limits::infinity(); + // positive == high cost == bad. + size_t count = 0; + if (config_.max_active == std::numeric_limits::max() && + config_.min_active == 0) { + for (Elem *e = list_head; e != NULL; e = e->tail, count++) { + BaseFloat w = static_cast(e->val->tot_cost); + if (w < best_weight) { + best_weight = w; + if (best_elem) *best_elem = e; } - if (tok_count != NULL) *tok_count = count; - if (adaptive_beam != NULL) *adaptive_beam = config_.beam; - return best_weight + config_.beam; - } else { + } + if (tok_count != NULL) *tok_count = count; + if (adaptive_beam != NULL) *adaptive_beam = config_.beam; + return best_weight + config_.beam; + } else { tmp_array_.clear(); for (Elem *e = list_head; e != NULL; e = e->tail, count++) { BaseFloat w = e->val->tot_cost; @@ -678,7 +679,7 @@ class LatticeBiglmFasterDecoder { *adaptive_beam = config_.beam; return beam_cutoff; } - } + } } inline StateId PropagateLm(StateId lm_state, @@ -713,7 +714,10 @@ class LatticeBiglmFasterDecoder { size_t tok_cnt; BaseFloat cur_cutoff = GetCutoff(last_toks, &tok_cnt, &adaptive_beam, &best_elem); PossiblyResizeHash(tok_cnt); // This makes sure the hash is always big enough. 
- + KALDI_VLOG(6) << "Adaptive beam on frame " << frame << "\t" << active_toks_.size() << " is " + << adaptive_beam << "\t" << cur_cutoff; + + BaseFloat next_cutoff = std::numeric_limits::infinity(); // pruning "online" before having seen all tokens diff --git a/src/decoder/lattice-faster-decoder.cc b/src/decoder/lattice-faster-decoder.cc index 963430a63f1..161f9bf228a 100644 --- a/src/decoder/lattice-faster-decoder.cc +++ b/src/decoder/lattice-faster-decoder.cc @@ -699,8 +699,8 @@ BaseFloat LatticeFasterDecoder::ProcessEmitting(DecodableInterface *decodable) { BaseFloat adaptive_beam; size_t tok_cnt; BaseFloat cur_cutoff = GetCutoff(final_toks, &tok_cnt, &adaptive_beam, &best_elem); - KALDI_VLOG(6) << "Adaptive beam on frame " << NumFramesDecoded() << " is " - << adaptive_beam; + KALDI_VLOG(6) << "Adaptive beam on frame " << frame << "\t" << NumFramesDecoded() << " is " + << adaptive_beam << "\t" << cur_cutoff; PossiblyResizeHash(tok_cnt); // This makes sure the hash is always big enough. From ebf91405b2fca3e4687f41daaac69196aca09279 Mon Sep 17 00:00:00 2001 From: chenzhehuai Date: Sat, 7 Apr 2018 19:41:52 -0400 Subject: [PATCH 07/93] tmp --- src/bin/latgen-constlm-faster-mapped.cc | 278 ++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 src/bin/latgen-constlm-faster-mapped.cc diff --git a/src/bin/latgen-constlm-faster-mapped.cc b/src/bin/latgen-constlm-faster-mapped.cc new file mode 100644 index 00000000000..e986814628b --- /dev/null +++ b/src/bin/latgen-constlm-faster-mapped.cc @@ -0,0 +1,278 @@ +// bin/latgen-constlm-faster-mapped .cc + +// Copyright 2018 Zhehuai Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/decoder-wrappers.h" +#include "decoder/decodable-matrix.h" +#include "base/timer.h" +#include "decoder/lattice-constlm-faster-decoder.h" + + +namespace kaldi { +// Takes care of output. Returns true on success. +bool DecodeUtterance(LatticeConstlmFasterDecoder &decoder, // not const but is really an input. + DecodableInterface &decodable, // not const but is really an input. + const TransitionModel &trans_model, + const fst::SymbolTable *word_syms, + std::string utt, + double acoustic_scale, + bool determinize, + bool allow_partial, + Int32VectorWriter *alignment_writer, + Int32VectorWriter *words_writer, + CompactLatticeWriter *compact_lattice_writer, + LatticeWriter *lattice_writer, + double *like_ptr) { // puts utterance's like in like_ptr on success. 
+ using fst::VectorFst; + + if (!decoder.Decode(&decodable)) { + KALDI_WARN << "Failed to decode file " << utt; + return false; + } + if (!decoder.ReachedFinal()) { + if (allow_partial) { + KALDI_WARN << "Outputting partial output for utterance " << utt + << " since no final-state reached\n"; + } else { + KALDI_WARN << "Not producing output for utterance " << utt + << " since no final-state reached and " + << "--allow-partial=false.\n"; + return false; + } + } + + double likelihood; + LatticeWeight weight; + int32 num_frames; + { // First do some stuff with word-level traceback... + VectorFst decoded; + decoder.GetBestPath(&decoded); + if (decoded.NumStates() == 0) + // Shouldn't really reach this point as already checked success. + KALDI_ERR << "Failed to get traceback for utterance " << utt; + + std::vector alignment; + std::vector words; + GetLinearSymbolSequence(decoded, &alignment, &words, &weight); + num_frames = alignment.size(); + if (words_writer->IsOpen()) + words_writer->Write(utt, words); + if (alignment_writer->IsOpen()) + alignment_writer->Write(utt, alignment); + if (word_syms != NULL) { + std::cerr << utt << ' '; + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms->Find(words[i]); + if (s == "") + KALDI_ERR << "Word-id " << words[i] <<" not in symbol table."; + std::cerr << s << ' '; + } + std::cerr << '\n'; + } + likelihood = -(weight.Value1() + weight.Value2()); + } + + // Get lattice, and do determinization if requested. + Lattice lat; + decoder.GetRawLattice(&lat); + if (lat.NumStates() == 0) + KALDI_ERR << "Unexpected problem getting lattice for utterance " << utt; + fst::Connect(&lat); + if (determinize) { + CompactLattice clat; + if (!DeterminizeLatticePhonePrunedWrapper( + trans_model, + &lat, + decoder.GetOptions().lattice_beam, + &clat, + decoder.GetOptions().det_opts)) + KALDI_WARN << "Determinization finished earlier than the beam for " + << "utterance " << utt; + // We'll write the lattice without acoustic scaling. + if (acoustic_scale != 0.0) + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &clat); + compact_lattice_writer->Write(utt, clat); + } else { + Lattice fst; + decoder.GetRawLattice(&fst); + if (fst.NumStates() == 0) + KALDI_ERR << "Unexpected problem getting lattice for utterance " + << utt; + fst::Connect(&fst); // Will get rid of this later... shouldn't have any + // disconnected states there, but we seem to. 
+ if (acoustic_scale != 0.0) // We'll write the lattice without acoustic scaling + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &fst); + lattice_writer->Write(utt, fst); + } + KALDI_LOG << "Log-like per frame for utterance " << utt << " is " + << (likelihood / num_frames) << " over " + << num_frames << " frames."; + KALDI_VLOG(2) << "Cost for utterance " << utt << " is " + << weight.Value1() << " + " << weight.Value2(); + *like_ptr = likelihood; + return true; +} + +} + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::VectorFst; + using fst::Fst; + using fst::StdArc; + using fst::ReadFstKaldi; + + const char *usage = + "Generate lattices using on-the-fly composition.\n" + "User supplies LM used to generate decoding graph, and desired LM;\n" + "this decoder applies the difference during decoding\n" + "Usage: latgen-biglm-faster-mapped [options] model-in (fst-in|fsts-rspecifier) " + "oldlm-fst-in newlm-fst-in features-rspecifier" + " lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]\n"; + ParseOptions po(usage); + Timer timer; + bool allow_partial = false; + BaseFloat acoustic_scale = 0.1; + LatticeConstlmFasterDecoderConfig config; + + std::string word_syms_filename; + config.Register(&po); + po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods"); + + po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]"); + po.Register("allow-partial", &allow_partial, "If true, produce output even if end state was not reached."); + + po.Read(argc, argv); + + if (po.NumArgs() < 6 || po.NumArgs() > 8) { + po.PrintUsage(); + exit(1); + } + + std::string model_in_filename = po.GetArg(1), + fst_in_str = po.GetArg(2), + old_lm_fst_rxfilename = po.GetArg(3), + new_lm_fst_rxfilename = po.GetArg(4), + feature_rspecifier = po.GetArg(5), + lattice_wspecifier = po.GetArg(6), + words_wspecifier = po.GetOptArg(7), + alignment_wspecifier = po.GetOptArg(8); + + TransitionModel trans_model; + ReadKaldiObject(model_in_filename, &trans_model); + + ConstArpaLm old_lm; + ReadKaldiObject(old_lm_fst_rxfilename, &old_lm); + ConstArpaLmDeterministicFst old_lm_dfst(old_lm); + ApplyProbabilityScale(-1.0, old_lm_dfst); // Negate old LM probs... + + ConstArpaLm new_lm; + ReadKaldiObject(new_lm_fst_rxfilename, &new_lm); + ConstArpaLmDeterministicFst new_lm_dfst(new_lm); + + fst::ComposeDeterministicOnDemandFst compose_dfst(&old_lm_dfst, + &new_lm_dfst); + fst::CacheDeterministicOnDemandFst cache_dfst(&compose_dfst); + + bool determinize = config.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? 
compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + + Int32VectorWriter words_writer(words_wspecifier); + + Int32VectorWriter alignment_writer(alignment_wspecifier); + + fst::SymbolTable *word_syms = NULL; + if (word_syms_filename != "") + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_success = 0, num_fail = 0; + + + if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + // Input FST is just one FST, not a table of FSTs. + Fst *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str); + + { + LatticeConstlmFasterDecoder decoder(*decode_fst, config, &cache_dfst); + timer.Reset(); + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + Matrix features (feature_reader.Value()); + feature_reader.FreeCurrent(); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + + DecodableMatrixScaledMapped decodable(trans_model, features, acoustic_scale); + + double like; + if (DecodeUtterance(decoder, decodable, trans_model, word_syms, + utt, acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, + &compact_lattice_writer, &lattice_writer, + &like)) { + tot_like += like; + frame_count += features.NumRows(); + num_success++; + } else num_fail++; + } + } + delete decode_fst; // delete this only after decoder goes out of scope. + } else { // We have different FSTs for different utterances. 
+ assert(0); + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " + << frame_count<<" frames."; + + delete word_syms; + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} From 4dd846ed82f1c70b09aa957f15db7c3b163ed3c2 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sat, 7 Apr 2018 18:36:55 -0700 Subject: [PATCH 08/93] single det --- src/bin/Makefile | 2 +- src/bin/latgen-biglm-faster-mapped.cc | 2 +- src/bin/latgen-constlm-faster-mapped.cc | 15 ++++++++------- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/bin/Makefile b/src/bin/Makefile index 165eac6bb26..5cf8ed63032 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -23,7 +23,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \ vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \ transform-vec align-text matrix-dim post-to-smat -BINFILES += latgen-biglm-faster-mapped +BINFILES += latgen-biglm-faster-mapped latgen-constlm-faster-mapped OBJFILES = diff --git a/src/bin/latgen-biglm-faster-mapped.cc b/src/bin/latgen-biglm-faster-mapped.cc index e8bc461afe4..548dff7533d 100644 --- a/src/bin/latgen-biglm-faster-mapped.cc +++ b/src/bin/latgen-biglm-faster-mapped.cc @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) { fst::BackoffDeterministicOnDemandFst new_lm_dfst(*new_lm_fst); fst::ComposeDeterministicOnDemandFst compose_dfst(&old_lm_dfst, &new_lm_dfst); - fst::CacheDeterministicOnDemandFst cache_dfst(&compose_dfst); + fst::CacheDeterministicOnDemandFst cache_dfst(&new_lm_dfst); bool determinize = config.determinize_lattice; CompactLatticeWriter compact_lattice_writer; diff --git a/src/bin/latgen-constlm-faster-mapped.cc b/src/bin/latgen-constlm-faster-mapped.cc index e986814628b..caf8dbc5004 100644 --- a/src/bin/latgen-constlm-faster-mapped.cc +++ b/src/bin/latgen-constlm-faster-mapped.cc @@ -26,12 +26,13 @@ #include "decoder/decoder-wrappers.h" #include "decoder/decodable-matrix.h" #include "base/timer.h" -#include "decoder/lattice-constlm-faster-decoder.h" +#include "lm/const-arpa-lm.h" +#include "decoder/lattice-biglm-faster-decoder.h" namespace kaldi { // Takes care of output. Returns true on success. -bool DecodeUtterance(LatticeConstlmFasterDecoder &decoder, // not const but is really an input. +bool DecodeUtterance(LatticeBiglmFasterDecoder &decoder, // not const but is really an input. DecodableInterface &decodable, // not const but is really an input. const TransitionModel &trans_model, const fst::SymbolTable *word_syms, @@ -157,7 +158,7 @@ int main(int argc, char *argv[]) { Timer timer; bool allow_partial = false; BaseFloat acoustic_scale = 0.1; - LatticeConstlmFasterDecoderConfig config; + LatticeBiglmFasterDecoderConfig config; std::string word_syms_filename; config.Register(&po); @@ -185,18 +186,18 @@ int main(int argc, char *argv[]) { TransitionModel trans_model; ReadKaldiObject(model_in_filename, &trans_model); + /* ConstArpaLm old_lm; ReadKaldiObject(old_lm_fst_rxfilename, &old_lm); ConstArpaLmDeterministicFst old_lm_dfst(old_lm); ApplyProbabilityScale(-1.0, old_lm_dfst); // Negate old LM probs... 
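+
+    (Presumably disabled because ApplyProbabilityScale() requires a mutable
+    FST while ConstArpaLmDeterministicFst is an on-demand
+    DeterministicOnDemandFst; with this block commented out the decoder
+    below composes with the new LM only, and no old-LM term is subtracted.)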
+ */ ConstArpaLm new_lm; ReadKaldiObject(new_lm_fst_rxfilename, &new_lm); ConstArpaLmDeterministicFst new_lm_dfst(new_lm); - fst::ComposeDeterministicOnDemandFst compose_dfst(&old_lm_dfst, - &new_lm_dfst); - fst::CacheDeterministicOnDemandFst cache_dfst(&compose_dfst); + fst::CacheDeterministicOnDemandFst cache_dfst(&new_lm_dfst); bool determinize = config.determinize_lattice; CompactLatticeWriter compact_lattice_writer; @@ -227,7 +228,7 @@ int main(int argc, char *argv[]) { Fst *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str); { - LatticeConstlmFasterDecoder decoder(*decode_fst, config, &cache_dfst); + LatticeBiglmFasterDecoder decoder(*decode_fst, config, &cache_dfst); timer.Reset(); for (; !feature_reader.Done(); feature_reader.Next()) { From fc767523651ba2bf0acdf464a1bf5b7126ed18c6 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sun, 8 Apr 2018 14:06:30 -0400 Subject: [PATCH 09/93] tmp --- src/bin/latgen-biglm-faster-mapped.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/latgen-biglm-faster-mapped.cc b/src/bin/latgen-biglm-faster-mapped.cc index 548dff7533d..e8bc461afe4 100644 --- a/src/bin/latgen-biglm-faster-mapped.cc +++ b/src/bin/latgen-biglm-faster-mapped.cc @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) { fst::BackoffDeterministicOnDemandFst new_lm_dfst(*new_lm_fst); fst::ComposeDeterministicOnDemandFst compose_dfst(&old_lm_dfst, &new_lm_dfst); - fst::CacheDeterministicOnDemandFst cache_dfst(&new_lm_dfst); + fst::CacheDeterministicOnDemandFst cache_dfst(&compose_dfst); bool determinize = config.determinize_lattice; CompactLatticeWriter compact_lattice_writer; From 451c471bc2ec9dcc189bac6bda7484afda71f43f Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sun, 8 Apr 2018 12:37:31 -0700 Subject: [PATCH 10/93] otf-res ntok=1 --- src/fstext/deterministic-fst-inl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fstext/deterministic-fst-inl.h b/src/fstext/deterministic-fst-inl.h index c6f99697e00..3dc49d04ff6 100644 --- a/src/fstext/deterministic-fst-inl.h +++ b/src/fstext/deterministic-fst-inl.h @@ -190,7 +190,7 @@ bool ComposeDeterministicOnDemandFst::GetArc(StateId s, Label ilabel, Arc arc2; if (!fst2_->GetArc(pr.second, arc1.olabel, &arc2)) return false; std::pair, StateId> new_value( - std::pair(arc1.nextstate, arc2.nextstate), + std::pair(arc1.nextstate, arc1.nextstate), next_state_); std::pair result = state_map_.insert(new_value); @@ -199,7 +199,7 @@ bool ComposeDeterministicOnDemandFst::GetArc(StateId s, Label ilabel, oarc->nextstate = result.first->second; oarc->weight = Times(arc1.weight, arc2.weight); if (result.second == true) { // was inserted - next_state_++; + //next_state_++; const std::pair &new_pair (new_value.first); state_vec_.push_back(new_pair); } From f66fc65336ab5bb9f144f24937b45c99ccaa23e8 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sun, 8 Apr 2018 17:41:02 -0700 Subject: [PATCH 11/93] ntok=1 --- src/decoder/lattice-biglm-faster-decoder.h | 45 ++++++++++++++++++++-- src/fstext/deterministic-fst-inl.h | 4 +- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/decoder/lattice-biglm-faster-decoder.h b/src/decoder/lattice-biglm-faster-decoder.h index b13236c2970..33bc551d76c 100644 --- a/src/decoder/lattice-biglm-faster-decoder.h +++ b/src/decoder/lattice-biglm-faster-decoder.h @@ -64,11 +64,12 @@ class LatticeBiglmFasterDecoder { KALDI_ASSERT(fst.Start() != fst::kNoStateId && lm_diff_fst->Start() != fst::kNoStateId); toks_.SetSize(1000); // just so on the first frame we do 
something reasonable. + toks_g1.SetSize(1000); // just so on the first frame we do something reasonable. } void SetOptions(const LatticeBiglmFasterDecoderConfig &config) { config_ = config; } LatticeBiglmFasterDecoderConfig GetOptions() { return config_; } ~LatticeBiglmFasterDecoder() { - DeleteElems(toks_.Clear()); + DeleteElems(toks_.Clear()); ClearActiveTokens(); } @@ -87,6 +88,7 @@ class LatticeBiglmFasterDecoder { Token *start_tok = new Token(0.0, 0.0, NULL, NULL); active_toks_[0].toks = start_tok; toks_.Insert(start_pair, start_tok); + toks_g1.Insert(PairToState(start_pair), start_pair); num_toks_++; ProcessNonemitting(0); @@ -298,6 +300,7 @@ class LatticeBiglmFasterDecoder { }; typedef HashList::Elem Elem; + typedef HashList::Elem Elem_g1; void PossiblyResizeHash(size_t num_toks) { size_t new_sz = static_cast(static_cast(num_toks) @@ -305,6 +308,9 @@ class LatticeBiglmFasterDecoder { if (new_sz > toks_.Size()) { toks_.SetSize(new_sz); } + if (new_sz > toks_g1.Size()) { + toks_g1.SetSize(new_sz); + } } // FindOrAddToken either locates a token in hash of toks_, @@ -312,7 +318,7 @@ class LatticeBiglmFasterDecoder { // for the current frame. [note: it's inserted if necessary into hash toks_ // and also into the singly linked list of tokens active on this frame // (whose head is at active_toks_[frame]). - inline Token *FindOrAddToken(PairId state_pair, int32 frame, BaseFloat tot_cost, + inline Token *FindOrAddToken_2(PairId state_pair, int32 frame, BaseFloat tot_cost, bool emitting, bool *changed) { // Returns the Token pointer. Sets "changed" (if non-NULL) to true // if the token was newly created or the cost changed. @@ -349,7 +355,29 @@ class LatticeBiglmFasterDecoder { return tok; } } - + inline Token *FindOrAddToken(PairId state_pair, int32 frame, BaseFloat tot_cost, + bool emitting, bool *changed) { + // Returns the Token pointer. Sets "changed" (if non-NULL) to true + // if the token was newly created or the cost changed. + KALDI_ASSERT(frame < active_toks_.size()); + Elem_g1 *e_found = toks_g1.Find(PairToState(state_pair)); + if (e_found == NULL) { // no such token presently. + toks_g1.Insert(PairToState(state_pair), state_pair); + return FindOrAddToken_2(state_pair, frame, tot_cost, emitting, changed); + } else { + Elem* e_f = toks_.Find(e_found->val); + assert(e_f); + Token *tok = e_f->val; // There is an existing Token for this state. + if (tok->tot_cost > tot_cost) { // replace old token + e_found->val = state_pair; + tok = FindOrAddToken_2(state_pair, frame, tot_cost, emitting, changed); + } else { + if (changed) *changed = false; + } + return tok; + } + } + // prunes outgoing links for all tokens in active_toks_[frame] // it's called by PruneActiveTokens // all links, that have link_extra_cost > lattice_beam are pruned @@ -441,6 +469,7 @@ class LatticeBiglmFasterDecoder { best_cost_nofinal = infinity; unordered_map tok_to_final_cost; Elem *cur_toks = toks_.Clear(); // swapping prev_toks_ / cur_toks_ + DeleteElems_1(toks_g1.Clear()); for (Elem *e = cur_toks, *e_tail; e != NULL; e = e_tail) { PairId state_pair = e->key; StateId state = PairToState(state_pair), @@ -709,6 +738,7 @@ class LatticeBiglmFasterDecoder { void ProcessEmitting(DecodableInterface *decodable, int32 frame) { // Processes emitting arcs for one frame. Propagates from prev_toks_ to cur_toks_. 
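    // A note on the two hashes used below: toks_ is indexed by PairId, a
    // uint64 packing (decoding-graph state, LM-diff-FST state), see
    // ConstructPair() / PairToState() / PairToLmState().  toks_g1, introduced
    // in this patch series, is indexed by the graph state alone and tracks
    // the best cost per graph state, so that pairs far worse than the best
    // token on the same graph state can be skipped before querying the LM.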
Elem *last_toks = toks_.Clear(); // swapping prev_toks_ / cur_toks_ + DeleteElems_1(toks_g1.Clear()); Elem *best_elem = NULL; BaseFloat adaptive_beam; size_t tok_cnt; @@ -857,6 +887,7 @@ class LatticeBiglmFasterDecoder { // more than one list (e.g. for current and previous frames), but only one of // them at a time can be indexed by StateId. HashList toks_; + HashList toks_g1; std::vector active_toks_; // Lists of tokens, indexed by // frame (members of TokenList are toks, must_prune_forward_links, // must_prune_tokens). @@ -886,6 +917,14 @@ class LatticeBiglmFasterDecoder { toks_.Delete(e); } toks_.Clear(); + DeleteElems_1(toks_g1.Clear()); + } + void DeleteElems_1(Elem_g1 *list) { + for (Elem_g1 *e = list, *e_tail; e != NULL; e = e_tail) { + e_tail = e->tail; + toks_g1.Delete(e); + } + toks_g1.Clear(); } void ClearActiveTokens() { // a cleanup routine, at utt end/begin diff --git a/src/fstext/deterministic-fst-inl.h b/src/fstext/deterministic-fst-inl.h index 3dc49d04ff6..c6f99697e00 100644 --- a/src/fstext/deterministic-fst-inl.h +++ b/src/fstext/deterministic-fst-inl.h @@ -190,7 +190,7 @@ bool ComposeDeterministicOnDemandFst::GetArc(StateId s, Label ilabel, Arc arc2; if (!fst2_->GetArc(pr.second, arc1.olabel, &arc2)) return false; std::pair, StateId> new_value( - std::pair(arc1.nextstate, arc1.nextstate), + std::pair(arc1.nextstate, arc2.nextstate), next_state_); std::pair result = state_map_.insert(new_value); @@ -199,7 +199,7 @@ bool ComposeDeterministicOnDemandFst::GetArc(StateId s, Label ilabel, oarc->nextstate = result.first->second; oarc->weight = Times(arc1.weight, arc2.weight); if (result.second == true) { // was inserted - //next_state_++; + next_state_++; const std::pair &new_pair (new_value.first); state_vec_.push_back(new_pair); } From a480cceaae1259ed2132f9dfeb5acdd14947b9be Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sun, 8 Apr 2018 18:47:33 -0700 Subject: [PATCH 12/93] add beam in g1_map --- src/decoder/lattice-biglm-faster-decoder.h | 53 ++++++++++++---------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/src/decoder/lattice-biglm-faster-decoder.h b/src/decoder/lattice-biglm-faster-decoder.h index 33bc551d76c..eb640d99937 100644 --- a/src/decoder/lattice-biglm-faster-decoder.h +++ b/src/decoder/lattice-biglm-faster-decoder.h @@ -300,7 +300,7 @@ class LatticeBiglmFasterDecoder { }; typedef HashList::Elem Elem; - typedef HashList::Elem Elem_g1; + typedef HashList::Elem Elem_g1; void PossiblyResizeHash(size_t num_toks) { size_t new_sz = static_cast(static_cast(num_toks) @@ -355,28 +355,27 @@ class LatticeBiglmFasterDecoder { return tok; } } - inline Token *FindOrAddToken(PairId state_pair, int32 frame, BaseFloat tot_cost, - bool emitting, bool *changed) { +#define res_beam 1 + inline bool FindOrAddToken(StateId state_id, int32 frame, BaseFloat tot_cost, + bool emitting, bool *changed, bool pp) { // Returns the Token pointer. Sets "changed" (if non-NULL) to true // if the token was newly created or the cost changed. KALDI_ASSERT(frame < active_toks_.size()); - Elem_g1 *e_found = toks_g1.Find(PairToState(state_pair)); + Elem_g1 *e_found = toks_g1.Find(state_id); if (e_found == NULL) { // no such token presently. - toks_g1.Insert(PairToState(state_pair), state_pair); - return FindOrAddToken_2(state_pair, frame, tot_cost, emitting, changed); + toks_g1.Insert(state_id, tot_cost); + return true; } else { - Elem* e_f = toks_.Find(e_found->val); - assert(e_f); - Token *tok = e_f->val; // There is an existing Token for this state. 
- if (tok->tot_cost > tot_cost) { // replace old token - e_found->val = state_pair; - tok = FindOrAddToken_2(state_pair, frame, tot_cost, emitting, changed); - } else { - if (changed) *changed = false; + if (tot_cost < e_found->val + res_beam) {// There is an existing Token for this state. + if (tot_cost < e_found->val) + e_found->val = tot_cost; + return true; + } + else { + return false; } - return tok; } - } + } // prunes outgoing links for all tokens in active_toks_[frame] // it's called by PruneActiveTokens @@ -712,10 +711,12 @@ class LatticeBiglmFasterDecoder { } inline StateId PropagateLm(StateId lm_state, - Arc *arc) { // returns new LM state. + Arc *arc, bool *pp=NULL) { // returns new LM state. if (arc->olabel == 0) { + if (pp) *pp=false; return lm_state; // no change in LM state if no word crossed. } else { // Propagate in the LM-diff FST. + if (pp) *pp=false; Arc lm_arc; bool ans = lm_diff_fst_->GetArc(lm_state, arc->olabel, &lm_arc); if (!ans) { // this case is unexpected for statistical LMs. @@ -790,16 +791,18 @@ class LatticeBiglmFasterDecoder { const Arc &arc_ref = aiter.Value(); if (arc_ref.ilabel != 0) { // propagate.. Arc arc(arc_ref); - StateId next_lm_state = PropagateLm(lm_state, &arc); - BaseFloat ac_cost = -decodable->LogLikelihood(frame-1, arc.ilabel), - graph_cost = arc.weight.Value(), + bool pp; + BaseFloat ac_cost = -decodable->LogLikelihood(frame-1, arc.ilabel); + if (!FindOrAddToken(arc.nextstate, frame, tok->tot_cost + ac_cost+ arc.weight.Value(), true, NULL, pp)) continue; + StateId next_lm_state = PropagateLm(lm_state, &arc, &pp); + BaseFloat graph_cost = arc.weight.Value(), cur_cost = tok->tot_cost, tot_cost = cur_cost + ac_cost + graph_cost; if (tot_cost > next_cutoff) continue; else if (tot_cost + config_.beam < next_cutoff) next_cutoff = tot_cost + config_.beam; // prune by best current token PairId next_pair = ConstructPair(arc.nextstate, next_lm_state); - Token *next_tok = FindOrAddToken(next_pair, frame, tot_cost, true, NULL); + Token *next_tok = FindOrAddToken_2(next_pair, frame, tot_cost, true, NULL); // true: emitting, NULL: no change indicator needed // Add ForwardLink from tok to next_tok (put on head of list tok->links) @@ -861,13 +864,15 @@ class LatticeBiglmFasterDecoder { const Arc &arc_ref = aiter.Value(); if (arc_ref.ilabel == 0) { // propagate nonemitting only... Arc arc(arc_ref); - StateId next_lm_state = PropagateLm(lm_state, &arc); + bool pp; + if (!FindOrAddToken(arc.nextstate, frame, tok->tot_cost + arc.weight.Value(), true, NULL, pp)) continue; + StateId next_lm_state = PropagateLm(lm_state, &arc, &pp); BaseFloat graph_cost = arc.weight.Value(), tot_cost = cur_cost + graph_cost; if (tot_cost < cutoff) { bool changed; PairId next_pair = ConstructPair(arc.nextstate, next_lm_state); - Token *new_tok = FindOrAddToken(next_pair, frame, tot_cost, + Token *new_tok = FindOrAddToken_2(next_pair, frame, tot_cost, false, &changed); // false: non-emit tok->links = new ForwardLink(new_tok, 0, arc.olabel, @@ -887,7 +892,7 @@ class LatticeBiglmFasterDecoder { // more than one list (e.g. for current and previous frames), but only one of // them at a time can be indexed by StateId. HashList toks_; - HashList toks_g1; + HashList toks_g1; std::vector active_toks_; // Lists of tokens, indexed by // frame (members of TokenList are toks, must_prune_forward_links, // must_prune_tokens). 
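An aside on the gating idea that patches 11 through 13 converge on, reduced to
a self-contained sketch: a per-graph-state record of the best cost seen this
frame admits a (graph state, LM state) pair only if it lies within res_beam of
that best, and patch 13 applies the test only to word-crossing arcs (pp is set
from arc.olabel > 0).  The names below are illustrative, not code from the
series.

    #include <cstdint>
    #include <unordered_map>

    typedef int32_t StateId;

    struct RescoreGate {
      float res_beam;                                // cf. "#define res_beam 1"
      std::unordered_map<StateId, float> best_cost;  // graph state -> best cost

      // Returns true if a token with this cost on this graph state is worth
      // expanding, i.e. it lies within res_beam of the best token seen on
      // the same graph state this frame.
      bool Admit(StateId graph_state, float tot_cost, bool word_crossing) {
        std::pair<std::unordered_map<StateId, float>::iterator, bool> r =
            best_cost.insert(std::make_pair(graph_state, tot_cost));
        if (r.second) return true;             // first token on this state
        float &best = r.first->second;
        if (tot_cost < best) best = tot_cost;  // keep the running minimum
        if (!word_crossing) return true;       // patch 13: gate word arcs only
        return tot_cost < best + res_beam;
      }
      // Reset at each new frame, cf. DeleteElems_1(toks_g1.Clear()).
      void NewFrame() { best_cost.clear(); }
    };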
From c0bf8268d33f1a563fd114c65ce8fc2455f57c71 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sun, 8 Apr 2018 19:09:19 -0700 Subject: [PATCH 13/93] tiny --- src/decoder/lattice-biglm-faster-decoder.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/decoder/lattice-biglm-faster-decoder.h b/src/decoder/lattice-biglm-faster-decoder.h index eb640d99937..429a16b2574 100644 --- a/src/decoder/lattice-biglm-faster-decoder.h +++ b/src/decoder/lattice-biglm-faster-decoder.h @@ -371,9 +371,12 @@ class LatticeBiglmFasterDecoder { e_found->val = tot_cost; return true; } - else { + else if (pp) { return false; } + else { + return true; + } } } @@ -791,7 +794,7 @@ class LatticeBiglmFasterDecoder { const Arc &arc_ref = aiter.Value(); if (arc_ref.ilabel != 0) { // propagate.. Arc arc(arc_ref); - bool pp; + bool pp=arc.olabel>0; BaseFloat ac_cost = -decodable->LogLikelihood(frame-1, arc.ilabel); if (!FindOrAddToken(arc.nextstate, frame, tok->tot_cost + ac_cost+ arc.weight.Value(), true, NULL, pp)) continue; StateId next_lm_state = PropagateLm(lm_state, &arc, &pp); @@ -864,7 +867,7 @@ class LatticeBiglmFasterDecoder { const Arc &arc_ref = aiter.Value(); if (arc_ref.ilabel == 0) { // propagate nonemitting only... Arc arc(arc_ref); - bool pp; + bool pp=arc.olabel>0; if (!FindOrAddToken(arc.nextstate, frame, tok->tot_cost + arc.weight.Value(), true, NULL, pp)) continue; StateId next_lm_state = PropagateLm(lm_state, &arc, &pp); BaseFloat graph_cost = arc.weight.Value(), From d54d45a285ba1c16a6dba95d5127d638494f5e82 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Mon, 9 Apr 2018 12:31:49 -0700 Subject: [PATCH 14/93] tiny --- src/bin/latgen-biglm-faster-mapped.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bin/latgen-biglm-faster-mapped.cc b/src/bin/latgen-biglm-faster-mapped.cc index e8bc461afe4..1f87572a4f3 100644 --- a/src/bin/latgen-biglm-faster-mapped.cc +++ b/src/bin/latgen-biglm-faster-mapped.cc @@ -198,7 +198,7 @@ int main(int argc, char *argv[]) { fst::BackoffDeterministicOnDemandFst new_lm_dfst(*new_lm_fst); fst::ComposeDeterministicOnDemandFst compose_dfst(&old_lm_dfst, &new_lm_dfst); - fst::CacheDeterministicOnDemandFst cache_dfst(&compose_dfst); + fst::CacheDeterministicOnDemandFst cache_dfst(&compose_dfst, 1e7); bool determinize = config.determinize_lattice; CompactLatticeWriter compact_lattice_writer; From e258ec183cd2dd12778762635719998bebaeea12 Mon Sep 17 00:00:00 2001 From: chenzhehuai Date: Thu, 12 Apr 2018 22:28:37 -0400 Subject: [PATCH 15/93] tmp --- src/bin/latgen-fasterlm-faster-mapped.cc | 286 ++++++++++++++++++++ src/lm/faster-arpa-lm.cc | 36 +++ src/lm/faster-arpa-lm.h | 324 +++++++++++++++++++++++ 3 files changed, 646 insertions(+) create mode 100644 src/bin/latgen-fasterlm-faster-mapped.cc create mode 100644 src/lm/faster-arpa-lm.cc create mode 100644 src/lm/faster-arpa-lm.h diff --git a/src/bin/latgen-fasterlm-faster-mapped.cc b/src/bin/latgen-fasterlm-faster-mapped.cc new file mode 100644 index 00000000000..fe6ff62f6eb --- /dev/null +++ b/src/bin/latgen-fasterlm-faster-mapped.cc @@ -0,0 +1,286 @@ +// bin/latgen-fasterlm-faster-mapped .cc + +// Copyright 2018 Zhehuai Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/decoder-wrappers.h" +#include "decoder/decodable-matrix.h" +#include "base/timer.h" +#include "lm/faster-arpa-lm.h" +#include "decoder/lattice-biglm-faster-decoder.h" + + +namespace kaldi { +// Takes care of output. Returns true on success. +bool DecodeUtterance(LatticeBiglmFasterDecoder &decoder, // not const but is really an input. + DecodableInterface &decodable, // not const but is really an input. + const TransitionModel &trans_model, + const fst::SymbolTable *word_syms, + std::string utt, + double acoustic_scale, + bool determinize, + bool allow_partial, + Int32VectorWriter *alignment_writer, + Int32VectorWriter *words_writer, + CompactLatticeWriter *compact_lattice_writer, + LatticeWriter *lattice_writer, + double *like_ptr) { // puts utterance's like in like_ptr on success. + using fst::VectorFst; + + if (!decoder.Decode(&decodable)) { + KALDI_WARN << "Failed to decode file " << utt; + return false; + } + if (!decoder.ReachedFinal()) { + if (allow_partial) { + KALDI_WARN << "Outputting partial output for utterance " << utt + << " since no final-state reached\n"; + } else { + KALDI_WARN << "Not producing output for utterance " << utt + << " since no final-state reached and " + << "--allow-partial=false.\n"; + return false; + } + } + + double likelihood; + LatticeWeight weight; + int32 num_frames; + { // First do some stuff with word-level traceback... + VectorFst decoded; + decoder.GetBestPath(&decoded); + if (decoded.NumStates() == 0) + // Shouldn't really reach this point as already checked success. + KALDI_ERR << "Failed to get traceback for utterance " << utt; + + std::vector alignment; + std::vector words; + GetLinearSymbolSequence(decoded, &alignment, &words, &weight); + num_frames = alignment.size(); + if (words_writer->IsOpen()) + words_writer->Write(utt, words); + if (alignment_writer->IsOpen()) + alignment_writer->Write(utt, alignment); + if (word_syms != NULL) { + std::cerr << utt << ' '; + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms->Find(words[i]); + if (s == "") + KALDI_ERR << "Word-id " << words[i] <<" not in symbol table."; + std::cerr << s << ' '; + } + std::cerr << '\n'; + } + likelihood = -(weight.Value1() + weight.Value2()); + } + + // Get lattice, and do determinization if requested. + Lattice lat; + decoder.GetRawLattice(&lat); + if (lat.NumStates() == 0) + KALDI_ERR << "Unexpected problem getting lattice for utterance " << utt; + fst::Connect(&lat); + if (determinize) { + CompactLattice clat; + if (!DeterminizeLatticePhonePrunedWrapper( + trans_model, + &lat, + decoder.GetOptions().lattice_beam, + &clat, + decoder.GetOptions().det_opts)) + KALDI_WARN << "Determinization finished earlier than the beam for " + << "utterance " << utt; + // We'll write the lattice without acoustic scaling. 
+      if (acoustic_scale != 0.0)
+        fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &clat);
+      compact_lattice_writer->Write(utt, clat);
+    } else {
+      Lattice fst;
+      decoder.GetRawLattice(&fst);
+      if (fst.NumStates() == 0)
+        KALDI_ERR << "Unexpected problem getting lattice for utterance "
+                  << utt;
+      fst::Connect(&fst); // Will get rid of this later... shouldn't have any
+                          // disconnected states there, but we seem to.
+      if (acoustic_scale != 0.0) // We'll write the lattice without acoustic scaling
+        fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &fst);
+      lattice_writer->Write(utt, fst);
+    }
+    KALDI_LOG << "Log-like per frame for utterance " << utt << " is "
+              << (likelihood / num_frames) << " over "
+              << num_frames << " frames.";
+    KALDI_VLOG(2) << "Cost for utterance " << utt << " is "
+                  << weight.Value1() << " + " << weight.Value2();
+    *like_ptr = likelihood;
+    return true;
+}
+
+}
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+    using fst::SymbolTable;
+    using fst::VectorFst;
+    using fst::Fst;
+    using fst::StdArc;
+    using fst::ReadFstKaldi;
+
+    const char *usage =
+        "Generate lattices using on-the-fly composition.\n"
+        "User supplies LM used to generate decoding graph, and desired LM;\n"
+        "this decoder applies the difference during decoding\n"
+        "Usage: latgen-fasterlm-faster-mapped [options] model-in (fst-in|fsts-rspecifier) "
+        "oldlm-fst-in newlm-fst-in features-rspecifier"
+        " lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]\n";
+    ParseOptions po(usage);
+    Timer timer;
+    bool allow_partial = false;
+    BaseFloat acoustic_scale = 0.1;
+    LatticeBiglmFasterDecoderConfig config;
+    config.Register(&po);
+
+    ArpaParseOptions arpa_options;
+    arpa_options.Register(&po);
+
+    std::string word_syms_filename;
+    po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
+
+    po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]");
+    po.Register("allow-partial", &allow_partial, "If true, produce output even if end state was not reached.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 6 || po.NumArgs() > 8) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_in_filename = po.GetArg(1),
+        fst_in_str = po.GetArg(2),
+        old_lm_fst_rxfilename = po.GetArg(3),
+        new_lm_fst_rxfilename = po.GetArg(4),
+        feature_rspecifier = po.GetArg(5),
+        lattice_wspecifier = po.GetArg(6),
+        words_wspecifier = po.GetOptArg(7),
+        alignment_wspecifier = po.GetOptArg(8);
+
+    TransitionModel trans_model;
+    ReadKaldiObject(model_in_filename, &trans_model);
+
+    /*
+    FasterArpaLm old_lm;
+    ReadKaldiObject(old_lm_fst_rxfilename, &old_lm);
+    FasterArpaLmDeterministicFst old_lm_dfst(old_lm);
+    ApplyProbabilityScale(-1.0, old_lm_dfst); // Negate old LM probs...
+    */
+
+    // The third argument is presumably an lm_scale of -1, so the old-LM
+    // scores enter negated, replacing the disabled block above.
+    FasterArpaLm old_lm(arpa_options, old_lm_fst_rxfilename, -1);
+    FasterArpaLmDeterministicFst old_lm_dfst(old_lm);
+
+    FasterArpaLm new_lm(arpa_options, new_lm_fst_rxfilename);
+    FasterArpaLmDeterministicFst new_lm_dfst(new_lm);
+
+    fst::ComposeDeterministicOnDemandFst<StdArc> compose_dfst(&old_lm_dfst,
+                                                              &new_lm_dfst);
+    fst::CacheDeterministicOnDemandFst<StdArc> cache_dfst(&compose_dfst, 1e7);
+
+    bool determinize = config.determinize_lattice;
+    CompactLatticeWriter compact_lattice_writer;
+    LatticeWriter lattice_writer;
+    if (! (determinize ?
+           compact_lattice_writer.Open(lattice_wspecifier)
+           : lattice_writer.Open(lattice_wspecifier)))
+      KALDI_ERR << "Could not open table for writing lattices: "
+                << lattice_wspecifier;
+
+    Int32VectorWriter words_writer(words_wspecifier);
+
+    Int32VectorWriter alignment_writer(alignment_wspecifier);
+
+    fst::SymbolTable *word_syms = NULL;
+    if (word_syms_filename != "")
+      if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename)))
+        KALDI_ERR << "Could not read symbol table from file "
+                  << word_syms_filename;
+
+    double tot_like = 0.0;
+    kaldi::int64 frame_count = 0;
+    int num_success = 0, num_fail = 0;
+
+
+    if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) {
+      SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier);
+      // Input FST is just one FST, not a table of FSTs.
+      Fst<StdArc> *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str);
+
+      {
+        LatticeBiglmFasterDecoder decoder(*decode_fst, config, &cache_dfst);
+        timer.Reset();
+
+        for (; !feature_reader.Done(); feature_reader.Next()) {
+          std::string utt = feature_reader.Key();
+          Matrix<BaseFloat> features(feature_reader.Value());
+          feature_reader.FreeCurrent();
+          if (features.NumRows() == 0) {
+            KALDI_WARN << "Zero-length utterance: " << utt;
+            num_fail++;
+            continue;
+          }
+
+          DecodableMatrixScaledMapped decodable(trans_model, features, acoustic_scale);
+
+          double like;
+          if (DecodeUtterance(decoder, decodable, trans_model, word_syms,
+                              utt, acoustic_scale, determinize, allow_partial,
+                              &alignment_writer, &words_writer,
+                              &compact_lattice_writer, &lattice_writer,
+                              &like)) {
+            tot_like += like;
+            frame_count += features.NumRows();
+            num_success++;
+          } else num_fail++;
+        }
+      }
+      delete decode_fst; // delete this only after decoder goes out of scope.
+    } else { // We have different FSTs for different utterances.
+      assert(0);
+    }
+
+    double elapsed = timer.Elapsed();
+    KALDI_LOG << "Time taken " << elapsed
+              << "s: real-time factor assuming 100 frames/sec is "
+              << (elapsed * 100.0 / frame_count);
+    KALDI_LOG << "Done " << num_success << " utterances, failed for "
+              << num_fail;
+    KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like / frame_count)
+              << " over " << frame_count << " frames.";
+
+    delete word_syms;
+    if (num_success != 0) return 0;
+    else return 1;
+  } catch(const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+}
diff --git a/src/lm/faster-arpa-lm.cc b/src/lm/faster-arpa-lm.cc
new file mode 100644
index 00000000000..81d0322ed5b
--- /dev/null
+++ b/src/lm/faster-arpa-lm.cc
@@ -0,0 +1,36 @@
+// lm/faster-arpa-lm.cc
+
+// Copyright 2018 Zhehuai Chen
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
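+
+// (Note: FasterArpaLm is implemented entirely inline in faster-arpa-lm.h at
+// this point, so this .cc mainly gives the header a translation unit to
+// compile against.)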
+
+#include <algorithm>
+#include <limits>
+#include <sstream>
+#include <utility>
+
+#include "base/kaldi-math.h"
+#include "lm/arpa-file-parser.h"
+#include "lm/faster-arpa-lm.h"
+#include "util/stl-utils.h"
+#include "util/text-utils.h"
+
+
+namespace kaldi {
+
+
+
+} // namespace kaldi
diff --git a/src/lm/faster-arpa-lm.h b/src/lm/faster-arpa-lm.h
new file mode 100644
index 00000000000..a9e6f06ac20
--- /dev/null
+++ b/src/lm/faster-arpa-lm.h
@@ -0,0 +1,324 @@
+// lm/faster-arpa-lm.h
+
+// Copyright 2018 Zhehuai Chen
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef KALDI_LM_FASTER_ARPA_LM_H_
+#define KALDI_LM_FASTER_ARPA_LM_H_
+
+#include <string>
+#include <vector>
+
+#include "base/kaldi-common.h"
+#include "fstext/deterministic-fst.h"
+#include "lm/arpa-file-parser.h"
+#include "util/common-utils.h"
+
+namespace kaldi {
+
+#define MAX_NGRAM (5+1)
+
+class FasterArpaLm {
+ public:
+
+  // LmState in FasterArpaLm: the basic storage unit
+  class LmState {
+   public:
+    LmState(): logprob_(0), backoff_logprob_(0) { }
+    void Allocate(const NGram* ngram, float lm_scale = 1) {
+      logprob_ = ngram->logprob * lm_scale;
+      backoff_logprob_ = ngram->backoff * lm_scale;
+      /*
+      std::vector<int32> &word_ids = ngram->words;
+      int32 ngram_order = word_ids.size();
+      int32 sz = sizeof(int32) * (ngram_order);
+      */
+    }
+    bool IsExist() const { return logprob_ != 0; }
+    ~LmState() { }
+
+    // for current query
+    float logprob_;
+    // for next query; can be optional
+    float backoff_logprob_;
+  };
+
+  // Class to build FasterArpaLm from an Arpa format language model. It relies
+  // on the auxiliary class LmState above.
+  class FasterArpaLmBuilder : public ArpaFileParser {
+   public:
+    FasterArpaLmBuilder(ArpaParseOptions &options, FasterArpaLm *lm,
+                        float lm_scale = 1):
+      ArpaFileParser(options, NULL), lm_(lm), lm_scale_(lm_scale) { }
+    ~FasterArpaLmBuilder() { }
+
+   protected:
+    // ArpaFileParser overrides.
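+    // (How these overrides fit together: ArpaFileParser invokes
+    // HeaderAvailable() once the \data\ n-gram counts are known -- used
+    // here to size the hash tables -- then ConsumeNGram() once per n-gram
+    // line, then ReadComplete() at end of file.  Each n-gram w_1..w_n is
+    // stored at a hashed index built from fixed per-(position, word)
+    // random integers, roughly
+    //   idx = (rand[0][w_1] ^ rand[1][w_2] ^ ... ^ rand[n-1][w_n]) & (size_n - 1),
+    // with power-of-two table sizes; unigrams are indexed directly by
+    // word id.)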
+ virtual void HeaderAvailable() { + lm_->Allocate(NgramCounts(), Symbols()); + } + virtual void ConsumeNGram(const NGram& ngram) { + LmState *lmstate = lm_->GetHashedState(ngram.words); + lmstate->Allocate(&ngram, lm_scale_); + } + + virtual void ReadComplete() { } + + private: + FasterArpaLm *lm_; + float lm_scale_; + }; + + FasterArpaLm(ArpaParseOptions &options, const std::string& arpa_rxfilename, + float lm_scale=0) { + is_built_ = false; + ngram_order_ = 0; + num_words_ = 0; + lm_states_size_ = 0; + ngrams_ = NULL; + randint_per_word_gram_ = NULL; + options_ = options; + + BuildFasterArpaLm(arpa_rxfilename, lm_scale); + } + + ~FasterArpaLm() { + if (is_built_) free(); + } + + inline LmState* GetHashedState(int32* word_ids, + int query_ngram_order) { + assert(query_ngram_order > 0 && query_ngram_order <= ngram_order_); + int32 ngram_order = query_ngram_order; + if (ngram_order == 1) { + return &ngrams_[ngram_order-1][word_ids[ngram_order-1]]; + } else { + int32 hashed_idx=randint_per_word_gram_[0][word_ids[0]]; + for (int i=1; i &word_ids, + int query_ngram_order = 0) { + int32 ngram_order = query_ngram_order==0? word_ids.size(): query_ngram_order; + int32 word_ids_arr[MAX_NGRAM]; + for (int i=0; i& o_word_ids) { + float prob; + assert(ngram_order > 0); + if (ngram_order > ngram_order_) { + //while (wseq.size() >= lm_.NgramOrder()) { + // History state has at most lm_.NgramOrder() -1 words in the state. + // wseq.erase(wseq.begin(), wseq.begin() + 1); + //} + // we don't need to do above things as we do in reverse fashion: + // memcpy(n_wids+1, wids, len(wids)); n_wids[0] = cur_wrd; + ngram_order = ngram_order_; + } + + LmState *lm_state = GetHashedState(word_ids, ngram_order); + assert(lm_state); + if (lm_state->IsExist()) { + prob = lm_state->logprob_; + o_word_ids.resize(ngram_order); + for (int i=0; ibackoff_logprob_ + + GetNgramLogprob(word_ids, ngram_order - 1, o_word_ids); + } + return prob; + } + + bool BuildFasterArpaLm(const std::string& arpa_rxfilename, float lm_scale) { + FasterArpaLmBuilder lm_builder(options_, this, lm_scale); + KALDI_VLOG(1) << "Reading " << arpa_rxfilename; + Input ki(arpa_rxfilename); + lm_builder.Read(ki.Stream()); + return true; + } + + private: + void Allocate(const std::vector& ngram_count, + const fst::SymbolTable* symbols) { + ngram_order_ = ngram_count.size(); + uint64 max_rand = -1; + kaldi::RandomState rstate; + rstate.seed = 27437; + ngrams_ = malloc(ngram_order_ * sizeof(void*)); + randint_per_word_gram_ = malloc(ngram_order_ * sizeof(void*)); + ngrams_hashed_size_ = malloc(ngram_order_ * sizeof(int32)); + for (int i=0; i< ngram_order_; i++) { + if (i == 0) ngrams_hashed_size_[i] = ngram_count[i]; // uni-gram + else { + ngrams_hashed_size_[i] = (1<<(int)ceil(log(ngram_count[i]) / + M_LN2 + 0.3)); + } + KALDI_VLOG(2) << "ngram: "<< i <<" hashed_size/size = "<< + ngrams_hashed_size_[i] / ngram_count[i]; + ngrams_[i] = new LmState[ngrams_hashed_size_[i]]; + randint_per_word_gram_[i] = new int32[symbols->NumSymbols()]; + for (int j=0; jNumSymbols(); j++) { + randint_per_word_gram_[i][j] = kaldi::RandInt(0, max_rand, &rstate); + } + } + is_built_ = true; + } + void free() { + for (int i=0; i< ngram_order_; i++) { + delete ngrams_[i]; + delete randint_per_word_gram_[i]; + } + delete ngrams_; + delete randint_per_word_gram_; + } + + private: + // configurations + + // Indicating if FasterArpaLm has been built or not. + bool is_built_; + // N-gram order of language model. 
This can be figured out from "/data/" + // section in Arpa format language model. + int32 ngram_order_; + // Index of largest word-id plus one. It defines the end of + // array. + int32 num_words_; + // Size of the array, which will be needed by I/O. + int64 lm_states_size_; + // Hash table from word sequences to LmStates. + unordered_map, + LmState*, VectorHasher > seq_to_state_; + ArpaParseOptions &options; + + // data + + // Memory blcok for storing N-gram; ngrams_[ngram_order][hashed_idx] + LmState** ngrams_; + // used to obtain hash value; randint_per_word_gram_[ngram_order][word_id] + uint64** randint_per_word_gram_; + int32* ngrams_hashed_size_; +}; + + +/** + This class wraps a FasterArpaLm format language model with the interface defined + in DeterministicOnDemandFst. + */ +class FasterArpaLmDeterministicFst + : public fst::DeterministicOnDemandFst { + public: + typedef fst::StdArc::Weight Weight; + typedef fst::StdArc::StateId StateId; + typedef fst::StdArc::Label Label; + typedef FasterArpaLm::LmState LmState; + + explicit FasterArpaLmDeterministicFst(const FasterArpaLm& lm): + lm_(lm), start_state_(0) { + // Creates a history state for . + std::vector. + int32 eos_symbol_; + // Integer corresponds to unknown-word. -1 if no unknown-word symbol is + // provided. + int32 unk_symbol_; // N-gram order of language model. This can be figured out from "/data/" // section in Arpa format language model. int32 ngram_order_; + int32 symbol_size_; // Index of largest word-id plus one. It defines the end of // array. int32 num_words_; @@ -215,14 +237,14 @@ class FasterArpaLm { // Hash table from word sequences to LmStates. unordered_map, LmState*, VectorHasher > seq_to_state_; - ArpaParseOptions &options; + ArpaParseOptions &options_; // data // Memory blcok for storing N-gram; ngrams_[ngram_order][hashed_idx] LmState** ngrams_; // used to obtain hash value; randint_per_word_gram_[ngram_order][word_id] - uint64** randint_per_word_gram_; + RAND_TYPE** randint_per_word_gram_; int32* ngrams_hashed_size_; }; @@ -240,7 +262,7 @@ class FasterArpaLmDeterministicFst typedef FasterArpaLm::LmState LmState; explicit FasterArpaLmDeterministicFst(const FasterArpaLm& lm): - lm_(lm), start_state_(0) { + start_state_(0), lm_(lm) { // Creates a history state for . std::vector. 
You must set this to your actual " + "EOS integer."); + + po.Read(argc, argv); + + { + std::string g_lm_fst_rxfilename = po.GetArg(1); + VectorFst *old_lm_fst = fst::CastOrConvertToVectorFst( + fst::ReadFstKaldiGeneric(g_lm_fst_rxfilename)); + fst::BackoffDeterministicOnDemandFst old_lm_dfst(*old_lm_fst); + fst::CacheDeterministicOnDemandFst cache_dfst(&old_lm_dfst, 1e7); + get_score(&cache_dfst, word_ids, state_ids, scores, TEST_SIZE); + } + { + std::string g_lm_fst_rxfilename = po.GetArg(2); + ConstArpaLm new_lm; + ReadKaldiObject(g_lm_fst_rxfilename, &new_lm); + ConstArpaLmDeterministicFst new_lm_dfst(new_lm); + fst::CacheDeterministicOnDemandFst cache_dfst(&new_lm_dfst, 1e7); + get_score(&cache_dfst, word_ids, state_ids, scores2, TEST_SIZE); + } + { + std::string g_lm_fst_rxfilename = po.GetArg(3); + FasterArpaLm new_lm(arpa_options, g_lm_fst_rxfilename, symbol_size); + FasterArpaLmDeterministicFst new_lm_dfst(new_lm); + fst::CacheDeterministicOnDemandFst cache_dfst(&new_lm_dfst, 1e7); + get_score(&cache_dfst, word_ids, state_ids, scores3, TEST_SIZE); + } + for (int i=0;i Date: Sat, 14 Apr 2018 11:10:43 -0700 Subject: [PATCH 23/93] found out it's h_value problem; add a hack to reduce colid --- src/bin/latgen-fasterlm-faster-mapped.cc | 2 +- src/lm/faster-arpa-lm-test.cc | 6 ++++-- src/lm/faster-arpa-lm.h | 15 ++++++++++++++- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/bin/latgen-fasterlm-faster-mapped.cc b/src/bin/latgen-fasterlm-faster-mapped.cc index 637df4896fa..42b310f3100 100644 --- a/src/bin/latgen-fasterlm-faster-mapped.cc +++ b/src/bin/latgen-fasterlm-faster-mapped.cc @@ -208,7 +208,7 @@ int main(int argc, char *argv[]) { FasterArpaLmDeterministicFst old_lm_dfst(old_lm); ApplyProbabilityScale(-1.0, old_lm_dfst); // Negate old LM probs... 
*/ -#if 0 +#if 1 FasterArpaLm old_lm(arpa_options, old_lm_fst_rxfilename, symbol_size, -1); FasterArpaLmDeterministicFst old_lm_dfst(old_lm); #else diff --git a/src/lm/faster-arpa-lm-test.cc b/src/lm/faster-arpa-lm-test.cc index e8d327fc975..44fd25e8d95 100644 --- a/src/lm/faster-arpa-lm-test.cc +++ b/src/lm/faster-arpa-lm-test.cc @@ -67,12 +67,14 @@ int main(int argc, char *argv[]) { #define Arc fst::StdArc using fst::ReadFstKaldi; -#define TEST_SIZE 25 +#define TEST_SIZE 28 +//#define TEST_SIZE 25 ParseOptions po(""); float scores[TEST_SIZE]; float scores2[TEST_SIZE]; float scores3[TEST_SIZE]; - int32 word_ids[]={14207, 198712, 7589, 175861, 171937, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 4, 20681, 194454, 137421, 158810, 161569, 4, 37434, 50498}; + //int32 word_ids[]={14207, 198712, 7589, 175861, 171937, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 4, 20681, 194454, 137421, 158810, 161569, 4, 37434, 50498}; + int32 word_ids[] = {14207, 198712, 7589, 4, 171935, 87918, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 4, 20681, 194454, 138359, 155516, 2379, 160908, 2811, 4, 37434, 50498}; int32 state_ids[TEST_SIZE]={0}; ArpaParseOptions arpa_options; diff --git a/src/lm/faster-arpa-lm.h b/src/lm/faster-arpa-lm.h index f81320f1c2e..fb69360a1fe 100644 --- a/src/lm/faster-arpa-lm.h +++ b/src/lm/faster-arpa-lm.h @@ -123,7 +123,8 @@ class FasterArpaLm { } else { hashed_idx=randint_per_word_gram_[0][word_ids[0]]; for (int i=1; iIsExist()); //assert(ngram_order==1 || GetHashedState(word_ids, ngram_order-1)->IsExist()); prob = lm_state->logprob_; + /* + for (int i=0; i0); + o_word_ids.resize(ngram_order); for (int i=0; i 1); // thus we can do backoff const LmState *lm_state_bo = GetHashedState(word_ids + 1, ngram_order-1); From 907d914db55417f7dc731d2509bac923253c974f Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sat, 14 Apr 2018 12:10:05 -0700 Subject: [PATCH 24/93] separate ngrams_map; use uint64 rand() --- src/bin/latgen-fasterlm-faster-mapped.cc | 1 - src/lm/faster-arpa-lm-test.cc | 10 +++-- src/lm/faster-arpa-lm.h | 48 +++++++++++++++--------- 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/src/bin/latgen-fasterlm-faster-mapped.cc b/src/bin/latgen-fasterlm-faster-mapped.cc index 42b310f3100..be2fce3e01d 100644 --- a/src/bin/latgen-fasterlm-faster-mapped.cc +++ b/src/bin/latgen-fasterlm-faster-mapped.cc @@ -189,7 +189,6 @@ int main(int argc, char *argv[]) { exit(1); } - KALDI_LOG << RAND_MAX; std::string model_in_filename = po.GetArg(1), fst_in_str = po.GetArg(2), old_lm_fst_rxfilename = po.GetArg(3), diff --git a/src/lm/faster-arpa-lm-test.cc b/src/lm/faster-arpa-lm-test.cc index 44fd25e8d95..b542c662988 100644 --- a/src/lm/faster-arpa-lm-test.cc +++ b/src/lm/faster-arpa-lm-test.cc @@ -67,14 +67,16 @@ int main(int argc, char *argv[]) { #define Arc fst::StdArc using fst::ReadFstKaldi; -#define TEST_SIZE 28 +#define TEST_SIZE 26 +//#define TEST_SIZE 28 //#define TEST_SIZE 25 ParseOptions po(""); float scores[TEST_SIZE]; float scores2[TEST_SIZE]; float scores3[TEST_SIZE]; //int32 word_ids[]={14207, 198712, 7589, 175861, 171937, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 4, 20681, 194454, 137421, 158810, 161569, 4, 37434, 50498}; - int32 word_ids[] = {14207, 198712, 7589, 4, 171935, 87918, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 4, 20681, 194454, 138359, 155516, 2379, 160908, 
2811, 4, 37434, 50498}; + //int32 word_ids[] = {14207, 198712, 7589, 4, 171935, 87918, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 4, 20681, 194454, 138359, 155516, 2379, 160908, 2811, 4, 37434, 50498}; + int32 word_ids[] = {14207, 198712, 7589, 175861, 171937, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 124782, 19206, 53865, 137753, 2279, 32505, 153074, 4, 37434, 50498}; int32 state_ids[TEST_SIZE]={0}; ArpaParseOptions arpa_options; @@ -117,8 +119,8 @@ int main(int argc, char *argv[]) { get_score(&cache_dfst, word_ids, state_ids, scores3, TEST_SIZE); } for (int i=0;i1e-4) KALDI_LOG<1e-4) KALDI_LOG<IsExist()); //assert(ngram_order==1 || GetHashedState(word_ids, ngram_order-1)->IsExist()); prob = lm_state->logprob_; - /* + +/* for (int i=0; i ngrams_map_; // hash to ngrams_ index // used to obtain hash value; randint_per_word_gram_[ngram_order][word_id] RAND_TYPE** randint_per_word_gram_; - int32* ngrams_hashed_size_; + int32* ngrams_hashed_size_; //after init, it's an accumulate value int32 hash_size_except_uni_; int32 max_collision_; }; From 7f272fdc948641c78ecc6bb539b3b7a3a90e6a39 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sat, 14 Apr 2018 12:30:19 -0700 Subject: [PATCH 25/93] match performance in exp_dec/constlm.1a/dec.log; but still larger toks --- src/lm/faster-arpa-lm-test.cc | 4 ++-- src/lm/faster-arpa-lm.h | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lm/faster-arpa-lm-test.cc b/src/lm/faster-arpa-lm-test.cc index b542c662988..9a72289be23 100644 --- a/src/lm/faster-arpa-lm-test.cc +++ b/src/lm/faster-arpa-lm-test.cc @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) { #define Arc fst::StdArc using fst::ReadFstKaldi; -#define TEST_SIZE 26 +#define TEST_SIZE 39 //#define TEST_SIZE 28 //#define TEST_SIZE 25 ParseOptions po(""); @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { float scores3[TEST_SIZE]; //int32 word_ids[]={14207, 198712, 7589, 175861, 171937, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 4, 20681, 194454, 137421, 158810, 161569, 4, 37434, 50498}; //int32 word_ids[] = {14207, 198712, 7589, 4, 171935, 87918, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 4, 20681, 194454, 138359, 155516, 2379, 160908, 2811, 4, 37434, 50498}; - int32 word_ids[] = {14207, 198712, 7589, 175861, 171937, 124782, 36528, 175861, 104488, 150861, 139719, 78075, 14268, 124782, 61783, 196158, 124782, 19206, 53865, 137753, 2279, 32505, 153074, 4, 37434, 50498}; + int32 word_ids[] = {78521, 148206, 178313, 175861, 144826, 28459, 25372, 62655, 138328, 175861, 72352, 76155, 152997, 4, 102911, 177031, 193231, 127711, 71590, 47932, 151710, 40606, 5411, 82074, 86219, 81505, 77097, 4, 155384, 194419, 193822, 71589, 76098, 163928, 124918, 177084, 9376, 81505, 78840}; int32 state_ids[TEST_SIZE]={0}; ArpaParseOptions arpa_options; diff --git a/src/lm/faster-arpa-lm.h b/src/lm/faster-arpa-lm.h index 13d7d0e1b53..fab511c0ea5 100644 --- a/src/lm/faster-arpa-lm.h +++ b/src/lm/faster-arpa-lm.h @@ -67,7 +67,7 @@ class FasterArpaLm { float logprob_; // for next query; can be optional float backoff_logprob_; - int32 h_value; + RAND_TYPE h_value; LmState* next; // for colid }; @@ -122,10 +122,10 @@ class FasterArpaLm { int32 NgramOrder() const { return ngram_order_; } inline int32 GetHashedIdx(const int32* word_ids, - int query_ngram_order, int32 *h_value=NULL) const { + int query_ngram_order, RAND_TYPE *h_value=NULL) 
const { assert(query_ngram_order > 0 && query_ngram_order <= ngram_order_); int32 ngram_order = query_ngram_order; - int32 hashed_idx; + RAND_TYPE hashed_idx; if (ngram_order == 1) { hashed_idx = word_ids[ngram_order-1]; } else { @@ -134,7 +134,7 @@ class FasterArpaLm { int word_id=word_ids[i]; hashed_idx ^= randint_per_word_gram_[i][word_id]; } - if (h_value) *h_value = hashed_idx; // to check colid + if (h_value) *h_value = hashed_idx; // to check colid, h_value should be precise int i = ngram_order-1; hashed_idx &= (ngrams_hashed_size_[i]-ngrams_hashed_size_[i-1] - 1); @@ -158,7 +158,7 @@ class FasterArpaLm { } inline void SaveHashedState(const int32* word_ids, int query_ngram_order, LmState &lm_state_pattern) { - int32 h_value=0; + RAND_TYPE h_value=0; int32 hashed_idx = GetHashedIdx(word_ids, query_ngram_order, &h_value); lm_state_pattern.h_value = h_value; int32 ngram_order = query_ngram_order; @@ -184,7 +184,7 @@ class FasterArpaLm { inline const LmState* GetHashedState(const int32* word_ids, int query_ngram_order) const { - int32 h_value; + RAND_TYPE h_value; int32 hashed_idx = GetHashedIdx(word_ids, query_ngram_order, &h_value); int32 ngram_order = query_ngram_order; if (ngram_order == 1) { From 7c65ea5ca28ae9de32362108efde4b01f9476595 Mon Sep 17 00:00:00 2001 From: Zhehuai Chen Date: Sat, 14 Apr 2018 13:00:27 -0700 Subject: [PATCH 26/93] add otfres; --- src/bin/Makefile | 2 +- .../latgen-otfres-fasterlm-faster-mapped.cc | 304 ++++++ .../lattice-otfres-biglm-faster-decoder.h | 957 ++++++++++++++++++ 3 files changed, 1262 insertions(+), 1 deletion(-) create mode 100644 src/bin/latgen-otfres-fasterlm-faster-mapped.cc create mode 100644 src/decoder/lattice-otfres-biglm-faster-decoder.h diff --git a/src/bin/Makefile b/src/bin/Makefile index 9ce73123612..439353b06eb 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -23,7 +23,7 @@ BINFILES = align-equal align-equal-compiled acc-tree-stats \ vector-sum matrix-sum-rows est-pca sum-lda-accs sum-mllt-accs \ transform-vec align-text matrix-dim post-to-smat -BINFILES += latgen-biglm-faster-mapped latgen-constlm-faster-mapped latgen-fasterlm-faster-mapped +BINFILES += latgen-biglm-faster-mapped latgen-constlm-faster-mapped latgen-fasterlm-faster-mapped latgen-otfres-fasterlm-faster-mapped OBJFILES = diff --git a/src/bin/latgen-otfres-fasterlm-faster-mapped.cc b/src/bin/latgen-otfres-fasterlm-faster-mapped.cc new file mode 100644 index 00000000000..ad475f9405f --- /dev/null +++ b/src/bin/latgen-otfres-fasterlm-faster-mapped.cc @@ -0,0 +1,304 @@ +// bin/latgen-otfres-fasterlm-faster-mapped .cc + +// Copyright 2018 Zhehuai Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
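+
+// ("otfres" here appears to stand for on-the-fly rescoring: the driver
+// below mirrors latgen-fasterlm-faster-mapped, but is built against the
+// experimental decoder in lattice-otfres-biglm-faster-decoder.h.)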
+ + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "tree/context-dep.h" +#include "hmm/transition-model.h" +#include "fstext/fstext-lib.h" +#include "decoder/decoder-wrappers.h" +#include "decoder/decodable-matrix.h" +#include "base/timer.h" +#include "lm/faster-arpa-lm.h" +#include "decoder/lattice-otfres-biglm-faster-decoder.h" + + +namespace kaldi { +// Takes care of output. Returns true on success. +bool DecodeUtterance(LatticeBiglmFasterDecoder &decoder, // not const but is really an input. + DecodableInterface &decodable, // not const but is really an input. + const TransitionModel &trans_model, + const fst::SymbolTable *word_syms, + std::string utt, + double acoustic_scale, + bool determinize, + bool allow_partial, + Int32VectorWriter *alignment_writer, + Int32VectorWriter *words_writer, + CompactLatticeWriter *compact_lattice_writer, + LatticeWriter *lattice_writer, + double *like_ptr) { // puts utterance's like in like_ptr on success. + using fst::VectorFst; + + if (!decoder.Decode(&decodable)) { + KALDI_WARN << "Failed to decode file " << utt; + return false; + } + if (!decoder.ReachedFinal()) { + if (allow_partial) { + KALDI_WARN << "Outputting partial output for utterance " << utt + << " since no final-state reached\n"; + } else { + KALDI_WARN << "Not producing output for utterance " << utt + << " since no final-state reached and " + << "--allow-partial=false.\n"; + return false; + } + } + + double likelihood; + LatticeWeight weight; + int32 num_frames; + { // First do some stuff with word-level traceback... + VectorFst decoded; + decoder.GetBestPath(&decoded); + if (decoded.NumStates() == 0) + // Shouldn't really reach this point as already checked success. + KALDI_ERR << "Failed to get traceback for utterance " << utt; + + std::vector alignment; + std::vector words; + GetLinearSymbolSequence(decoded, &alignment, &words, &weight); + num_frames = alignment.size(); + if (words_writer->IsOpen()) + words_writer->Write(utt, words); + if (alignment_writer->IsOpen()) + alignment_writer->Write(utt, alignment); + if (word_syms != NULL) { + std::cerr << utt << ' '; + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms->Find(words[i]); + if (s == "") + KALDI_ERR << "Word-id " << words[i] <<" not in symbol table."; + std::cerr << s << ' '; + } + std::cerr << '\n'; + } + likelihood = -(weight.Value1() + weight.Value2()); + } + + // Get lattice, and do determinization if requested. + Lattice lat; + decoder.GetRawLattice(&lat); + if (lat.NumStates() == 0) + KALDI_ERR << "Unexpected problem getting lattice for utterance " << utt; + fst::Connect(&lat); + if (determinize) { + CompactLattice clat; + if (!DeterminizeLatticePhonePrunedWrapper( + trans_model, + &lat, + decoder.GetOptions().lattice_beam, + &clat, + decoder.GetOptions().det_opts)) + KALDI_WARN << "Determinization finished earlier than the beam for " + << "utterance " << utt; + // We'll write the lattice without acoustic scaling. + if (acoustic_scale != 0.0) + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &clat); + compact_lattice_writer->Write(utt, clat); + } else { + Lattice fst; + decoder.GetRawLattice(&fst); + if (fst.NumStates() == 0) + KALDI_ERR << "Unexpected problem getting lattice for utterance " + << utt; + fst::Connect(&fst); // Will get rid of this later... shouldn't have any + // disconnected states there, but we seem to. 
+    if (acoustic_scale != 0.0) // We'll write the lattice without acoustic scaling
+      fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), &fst);
+    lattice_writer->Write(utt, fst);
+  }
+  KALDI_LOG << "Log-like per frame for utterance " << utt << " is "
+            << (likelihood / num_frames) << " over "
+            << num_frames << " frames.";
+  KALDI_VLOG(2) << "Cost for utterance " << utt << " is "
+                << weight.Value1() << " + " << weight.Value2();
+  *like_ptr = likelihood;
+  return true;
+}
+
+}  // namespace kaldi
+
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    typedef kaldi::int32 int32;
+    using fst::SymbolTable;
+    using fst::VectorFst;
+    using fst::Fst;
+    using fst::StdArc;
+    using fst::ReadFstKaldi;
+
+    const char *usage =
+        "Generate lattices using on-the-fly composition.\n"
+        "User supplies LM used to generate decoding graph, and desired LM;\n"
+        "this decoder applies the difference during decoding.\n"
+        "Usage: latgen-otfres-fasterlm-faster-mapped [options] model-in (fst-in|fsts-rspecifier) "
+        "oldlm-fst-in newlm-fst-in features-rspecifier"
+        " lattice-wspecifier [ words-wspecifier [alignments-wspecifier] ]\n";
+    ParseOptions po(usage);
+    Timer timer;
+    bool allow_partial = false;
+    BaseFloat acoustic_scale = 0.1;
+    int32 symbol_size = 0;
+    LatticeBiglmFasterDecoderConfig config;
+    config.Register(&po);
+
+    ArpaParseOptions arpa_options;
+    arpa_options.Register(&po);
+    po.Register("symbol-size", &symbol_size, "symbol table size");
+    po.Register("unk-symbol", &arpa_options.unk_symbol,
+                "Integer corresponds to unknown-word in language model. -1 if "
+                "no such word is provided.");
+    po.Register("bos-symbol", &arpa_options.bos_symbol,
+                "Integer corresponds to <s>. You must set this to your actual "
+                "BOS integer.");
+    po.Register("eos-symbol", &arpa_options.eos_symbol,
+                "Integer corresponds to </s>. You must set this to your actual "
+                "EOS integer.");
+
+
+    std::string word_syms_filename;
+    po.Register("acoustic-scale", &acoustic_scale, "Scaling factor for acoustic likelihoods");
+
+    po.Register("word-symbol-table", &word_syms_filename, "Symbol table for words [for debug output]");
+    po.Register("allow-partial", &allow_partial, "If true, produce output even if end state was not reached.");
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() < 6 || po.NumArgs() > 8) {
+      po.PrintUsage();
+      exit(1);
+    }
+
+    std::string model_in_filename = po.GetArg(1),
+        fst_in_str = po.GetArg(2),
+        old_lm_fst_rxfilename = po.GetArg(3),
+        new_lm_fst_rxfilename = po.GetArg(4),
+        feature_rspecifier = po.GetArg(5),
+        lattice_wspecifier = po.GetArg(6),
+        words_wspecifier = po.GetOptArg(7),
+        alignment_wspecifier = po.GetOptArg(8);
+
+    TransitionModel trans_model;
+    ReadKaldiObject(model_in_filename, &trans_model);
+
+    /*
+    FasterArpaLm old_lm;
+    ReadKaldiObject(old_lm_fst_rxfilename, &old_lm);
+    FasterArpaLmDeterministicFst old_lm_dfst(old_lm);
+    ApplyProbabilityScale(-1.0, old_lm_dfst); // Negate old LM probs...
+    */
+#if 1
+    FasterArpaLm old_lm(arpa_options, old_lm_fst_rxfilename, symbol_size, -1);
+    FasterArpaLmDeterministicFst old_lm_dfst(old_lm);
+#else
+    VectorFst<StdArc> *old_lm_fst = fst::CastOrConvertToVectorFst(
+        fst::ReadFstKaldiGeneric(old_lm_fst_rxfilename));
+    ApplyProbabilityScale(-1.0, old_lm_fst); // Negate old LM probs...
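+    // (The #else branch is the reference path: it reads the old G as a
+    // VectorFst and follows its backoff structure on demand through
+    // BackoffDeterministicOnDemandFst, whereas the #if 1 branch queries
+    // the hashed FasterArpaLm built straight from the ARPA file.)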
+ fst::BackoffDeterministicOnDemandFst old_lm_dfst(*old_lm_fst); +#endif + + FasterArpaLm new_lm(arpa_options, new_lm_fst_rxfilename, symbol_size); + FasterArpaLmDeterministicFst new_lm_dfst(new_lm); + + fst::ComposeDeterministicOnDemandFst compose_dfst(&old_lm_dfst, + &new_lm_dfst); + fst::CacheDeterministicOnDemandFst cache_dfst(&compose_dfst, 1e7); + + bool determinize = config.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + + Int32VectorWriter words_writer(words_wspecifier); + + Int32VectorWriter alignment_writer(alignment_wspecifier); + + fst::SymbolTable *word_syms = NULL; + if (word_syms_filename != "") + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + double tot_like = 0.0; + kaldi::int64 frame_count = 0; + int num_success = 0, num_fail = 0; + + + if (ClassifyRspecifier(fst_in_str, NULL, NULL) == kNoRspecifier) { + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + // Input FST is just one FST, not a table of FSTs. + Fst *decode_fst = fst::ReadFstKaldiGeneric(fst_in_str); + + { + LatticeBiglmFasterDecoder decoder(*decode_fst, config, &cache_dfst); + timer.Reset(); + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + Matrix features (feature_reader.Value()); + feature_reader.FreeCurrent(); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + + DecodableMatrixScaledMapped decodable(trans_model, features, acoustic_scale); + + double like; + if (DecodeUtterance(decoder, decodable, trans_model, word_syms, + utt, acoustic_scale, determinize, allow_partial, + &alignment_writer, &words_writer, + &compact_lattice_writer, &lattice_writer, + &like)) { + tot_like += like; + frame_count += features.NumRows(); + num_success++; + } else num_fail++; + } + } + delete decode_fst; // delete this only after decoder goes out of scope. + } else { // We have different FSTs for different utterances. + assert(0); + } + + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (elapsed*100.0/frame_count); + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + KALDI_LOG << "Overall log-likelihood per frame is " << (tot_like/frame_count) << " over " + << frame_count<<" frames."; + + delete word_syms; + if (num_success != 0) return 0; + else return 1; + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/decoder/lattice-otfres-biglm-faster-decoder.h b/src/decoder/lattice-otfres-biglm-faster-decoder.h new file mode 100644 index 00000000000..841547b9cca --- /dev/null +++ b/src/decoder/lattice-otfres-biglm-faster-decoder.h @@ -0,0 +1,957 @@ +// decoder/lattice-otfres-biglm-faster-decoder.h + +// Copyright 2009-2011 Microsoft Corporation, Mirko Hannemann, +// Gilles Boulianne + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef KALDI_DECODER_LATTICE_BIGLM_FASTER_DECODER_H_ +#define KALDI_DECODER_LATTICE_BIGLM_FASTER_DECODER_H_ + + +#include "util/stl-utils.h" +#include "util/hash-list.h" +#include "fst/fstlib.h" +#include "itf/decodable-itf.h" +#include "fstext/fstext-lib.h" +#include "lat/kaldi-lattice.h" +#include "decoder/lattice-faster-decoder.h" // for options. + + +namespace kaldi { + +// The options are the same as for lattice-faster-decoder.h for now. +typedef LatticeFasterDecoderConfig LatticeBiglmFasterDecoderConfig; + +/** This is as LatticeFasterDecoder, but does online composition between + HCLG and the "difference language model", which is a deterministic + FST that represents the difference between the language model you want + and the language model you compiled HCLG with. The class + DeterministicOnDemandFst follows through the epsilons in G for you + (assuming G is a standard backoff language model) and makes it look + like a determinized FST. +*/ + +class LatticeBiglmFasterDecoder { + public: + typedef fst::StdArc Arc; + typedef Arc::Label Label; + typedef Arc::StateId StateId; + // A PairId will be constructed as: (StateId in fst) + (StateId in lm_diff_fst) << 32; + typedef uint64 PairId; + typedef Arc::Weight Weight; + // instantiate this class once for each thing you have to decode. + LatticeBiglmFasterDecoder( + const fst::Fst &fst, + const LatticeBiglmFasterDecoderConfig &config, + fst::DeterministicOnDemandFst *lm_diff_fst): + fst_(fst), lm_diff_fst_(lm_diff_fst), config_(config), + warned_noarc_(false), num_toks_(0) { + config.Check(); + KALDI_ASSERT(fst.Start() != fst::kNoStateId && + lm_diff_fst->Start() != fst::kNoStateId); + toks_.SetSize(1000); // just so on the first frame we do something reasonable. + toks_g1.SetSize(1000); // just so on the first frame we do something reasonable. + } + void SetOptions(const LatticeBiglmFasterDecoderConfig &config) { config_ = config; } + LatticeBiglmFasterDecoderConfig GetOptions() { return config_; } + ~LatticeBiglmFasterDecoder() { + DeleteElems(toks_.Clear()); + ClearActiveTokens(); + } + + // Returns true if any kind of traceback is available (not necessarily from + // a final state). + bool Decode(DecodableInterface *decodable) { + // clean up from last time: + DeleteElems(toks_.Clear()); + ClearActiveTokens(); + warned_ = false; + final_active_ = false; + final_costs_.clear(); + num_toks_ = 0; + PairId start_pair = ConstructPair(fst_.Start(), lm_diff_fst_->Start()); + active_toks_.resize(1); + Token *start_tok = new Token(0.0, 0.0, NULL, NULL); + active_toks_[0].toks = start_tok; + toks_.Insert(start_pair, start_tok); + toks_g1.Insert(PairToState(start_pair), start_pair); + num_toks_++; + ProcessNonemitting(0); + + // We use 1-based indexing for frames in this decoder (if you view it in + // terms of features), but note that the decodable object uses zero-based + // numbering, which we have to correct for when we call it. 
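+    // (Concretely: 1-based frame f consumes 0-based decodable frame f-1,
+    // and the loop test IsLastFrame(frame-2) asks whether the frame
+    // processed on the previous iteration was the final one.)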
+ for (int32 frame = 1; !decodable->IsLastFrame(frame-2); frame++) { + active_toks_.resize(frame+1); // new column + + ProcessEmitting(decodable, frame); + + ProcessNonemitting(frame); + + if (decodable->IsLastFrame(frame-1)) + PruneActiveTokensFinal(frame); + else if (frame % config_.prune_interval == 0) + PruneActiveTokens(frame, config_.lattice_beam * 0.1); // use larger delta. + } + // Returns true if we have any kind of traceback available (not necessarily + // to the end state; query ReachedFinal() for that). + return !final_costs_.empty(); + } + + /// says whether a final-state was active on the last frame. If it was not, the + /// lattice (or traceback) will end with states that are not final-states. + bool ReachedFinal() const { return final_active_; } + + + // Outputs an FST corresponding to the single best path + // through the lattice. + bool GetBestPath(fst::MutableFst *ofst, + bool use_final_probs = true) const { + fst::VectorFst fst; + if (!GetRawLattice(&fst, use_final_probs)) return false; + // std::cout << "Raw lattice is:\n"; + // fst::FstPrinter fstprinter(fst, NULL, NULL, NULL, false, true); + // fstprinter.Print(&std::cout, "standard output"); + ShortestPath(fst, ofst); + return true; + } + + // Outputs an FST corresponding to the raw, state-level + // tracebacks. + bool GetRawLattice(fst::MutableFst *ofst, + bool use_final_probs = true) const { + typedef LatticeArc Arc; + typedef Arc::StateId StateId; + // A PairId will be constructed as: (StateId in fst) + (StateId in lm_diff_fst) << 32; + typedef uint64 PairId; + typedef Arc::Weight Weight; + typedef Arc::Label Label; + ofst->DeleteStates(); + // num-frames plus one (since frames are one-based, and we have + // an extra frame for the start-state). + int32 num_frames = active_toks_.size() - 1; + KALDI_ASSERT(num_frames > 0); + unordered_map tok_map(num_toks_/2 + 3); // bucket count + // First create all states. + for (int32 f = 0; f <= num_frames; f++) { + if (active_toks_[f].toks == NULL) { + KALDI_WARN << "GetRawLattice: no tokens active on frame " << f + << ": not producing lattice.\n"; + return false; + } + for (Token *tok = active_toks_[f].toks; tok != NULL; tok = tok->next) + tok_map[tok] = ofst->AddState(); + // The next statement sets the start state of the output FST. + // Because we always add new states to the head of the list + // active_toks_[f].toks, and the start state was the first one + // added, it will be the last one added to ofst. + if (f == 0 && ofst->NumStates() > 0) + ofst->SetStart(ofst->NumStates()-1); + } + KALDI_VLOG(3) << "init:" << num_toks_/2 + 3 << " buckets:" + << tok_map.bucket_count() << " load:" << tok_map.load_factor() + << " max:" << tok_map.max_load_factor(); + // Now create all arcs. + StateId cur_state = 0; // we rely on the fact that we numbered these + // consecutively (AddState() returns the numbers in order..) 
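+    // (The loop below revisits tokens in exactly the (frame, list) order
+    // in which their states were created above, so cur_state tracks each
+    // source token's state id; only destination states need the tok_map
+    // lookup.)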
+ for (int32 f = 0; f <= num_frames; f++) { + for (Token *tok = active_toks_[f].toks; tok != NULL; tok = tok->next, + cur_state++) { + for (ForwardLink *l = tok->links; + l != NULL; + l = l->next) { + unordered_map::const_iterator iter = + tok_map.find(l->next_tok); + StateId nextstate = iter->second; + KALDI_ASSERT(iter != tok_map.end()); + Arc arc(l->ilabel, l->olabel, + Weight(l->graph_cost, l->acoustic_cost), + nextstate); + ofst->AddArc(cur_state, arc); + } + if (f == num_frames) { + if (use_final_probs && !final_costs_.empty()) { + std::map::const_iterator iter = + final_costs_.find(tok); + if (iter != final_costs_.end()) + ofst->SetFinal(cur_state, LatticeWeight(iter->second, 0)); + } else { + ofst->SetFinal(cur_state, LatticeWeight::One()); + } + } + } + } + KALDI_ASSERT(cur_state == ofst->NumStates()); + return (cur_state != 0); + } + + // This function is now deprecated, since now we do determinization from + // outside the LatticeBiglmFasterDecoder class. + // Outputs an FST corresponding to the lattice-determinized + // lattice (one path per word sequence). + bool GetLattice(fst::MutableFst *ofst, + bool use_final_probs = true) const { + Lattice raw_fst; + if (!GetRawLattice(&raw_fst, use_final_probs)) return false; + Invert(&raw_fst); // make it so word labels are on the input. + if (!TopSort(&raw_fst)) // topological sort makes lattice-determinization more efficient + KALDI_WARN << "Topological sorting of state-level lattice failed " + "(probably your lexicon has empty words or your LM has epsilon cycles; this " + " is a bad idea.)"; + // (in phase where we get backward-costs). + fst::ILabelCompare ilabel_comp; + ArcSort(&raw_fst, ilabel_comp); // sort on ilabel; makes + // lattice-determinization more efficient. + + fst::DeterminizeLatticePrunedOptions lat_opts; + lat_opts.max_mem = config_.det_opts.max_mem; + + DeterminizeLatticePruned(raw_fst, config_.lattice_beam, ofst, lat_opts); + raw_fst.DeleteStates(); // Free memory-- raw_fst no longer needed. + Connect(ofst); // Remove unreachable states... there might be + // a small number of these, in some cases. + return true; + } + + private: + inline PairId ConstructPair(StateId fst_state, StateId lm_state) { + return static_cast(fst_state) + (static_cast(lm_state) << 32); + } + + static inline StateId PairToState(PairId state_pair) { + return static_cast(static_cast(state_pair)); + } + static inline StateId PairToLmState(PairId state_pair) { + return static_cast(static_cast(state_pair >> 32)); + } + + struct Token; + // ForwardLinks are the links from a token to a token on the next frame. + // or sometimes on the current frame (for input-epsilon links). + struct ForwardLink { + Token *next_tok; // the next token [or NULL if represents final-state] + Label ilabel; // ilabel on link. + Label olabel; // olabel on link. + BaseFloat graph_cost; // graph cost of traversing link (contains LM, etc.) + BaseFloat acoustic_cost; // acoustic cost (pre-scaled) of traversing link + ForwardLink *next; // next in singly-linked list of forward links from a + // token. + inline ForwardLink(Token *next_tok, Label ilabel, Label olabel, + BaseFloat graph_cost, BaseFloat acoustic_cost, + ForwardLink *next): + next_tok(next_tok), ilabel(ilabel), olabel(olabel), + graph_cost(graph_cost), acoustic_cost(acoustic_cost), + next(next) { } + }; + + // Token is what's resident in a particular state at a particular time. + // In this decoder a Token actually contains *forward* links. + // When first created, a Token just has the (total) cost. 
We add forward + // links to it when we process the next frame. + struct Token { + BaseFloat tot_cost; // would equal weight.Value()... cost up to this point. + BaseFloat extra_cost; // >= 0. After calling PruneForwardLinks, this equals + // the minimum difference between the cost of the best path, and the cost of + // this is on, and the cost of the absolute best path, under the assumption + // that any of the currently active states at the decoding front may + // eventually succeed (e.g. if you were to take the currently active states + // one by one and compute this difference, and then take the minimum). + + ForwardLink *links; // Head of singly linked list of ForwardLinks + + Token *next; // Next in list of tokens for this frame. + + inline Token(BaseFloat tot_cost, BaseFloat extra_cost, ForwardLink *links, + Token *next): tot_cost(tot_cost), extra_cost(extra_cost), + links(links), next(next) { } + inline void DeleteForwardLinks() { + ForwardLink *l = links, *m; + while (l != NULL) { + m = l->next; + delete l; + l = m; + } + links = NULL; + } + }; + + // head and tail of per-frame list of Tokens (list is in topological order), + // and something saying whether we ever pruned it using PruneForwardLinks. + struct TokenList { + Token *toks; + bool must_prune_forward_links; + bool must_prune_tokens; + TokenList(): toks(NULL), must_prune_forward_links(true), + must_prune_tokens(true) { } + }; + + typedef HashList::Elem Elem; + typedef HashList::Elem Elem_g1; + + void PossiblyResizeHash(size_t num_toks) { + size_t new_sz = static_cast(static_cast(num_toks) + * config_.hash_ratio); + if (new_sz > toks_.Size()) { + toks_.SetSize(new_sz); + } + if (new_sz > toks_g1.Size()) { + toks_g1.SetSize(new_sz); + } + } + + // FindOrAddToken either locates a token in hash of toks_, + // or if necessary inserts a new, empty token (i.e. with no forward links) + // for the current frame. [note: it's inserted if necessary into hash toks_ + // and also into the singly linked list of tokens active on this frame + // (whose head is at active_toks_[frame]). + inline Token *FindOrAddToken_2(PairId state_pair, int32 frame, BaseFloat tot_cost, + bool emitting, bool *changed) { + // Returns the Token pointer. Sets "changed" (if non-NULL) to true + // if the token was newly created or the cost changed. + KALDI_ASSERT(frame < active_toks_.size()); + Token *&toks = active_toks_[frame].toks; + Elem *e_found = toks_.Find(state_pair); + if (e_found == NULL) { // no such token presently. + const BaseFloat extra_cost = 0.0; + // tokens on the currently final frame have zero extra_cost + // as any of them could end up + // on the winning path. + Token *new_tok = new Token (tot_cost, extra_cost, NULL, toks); + // NULL: no forward links yet + toks = new_tok; + num_toks_++; + toks_.Insert(state_pair, new_tok); + if (changed) *changed = true; + return new_tok; + } else { + Token *tok = e_found->val; // There is an existing Token for this state. + if (tok->tot_cost > tot_cost) { // replace old token + tok->tot_cost = tot_cost; + // we don't allocate a new token, the old stays linked in active_toks_ + // we only replace the tot_cost + // in the current frame, there are no forward links (and no extra_cost) + // only in ProcessNonemitting we have to delete forward links + // in case we visit a state for the second time + // those forward links, that lead to this replaced token before: + // they remain and will hopefully be pruned later (PruneForwardLinks...) 
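+        // (A forward link survives PruneForwardLinks only if its slack,
+        // next_tok->extra_cost + (tok->tot_cost + link costs
+        // - next_tok->tot_cost), stays within lattice_beam; any link with
+        // more slack cannot lie on a path kept in the pruned lattice.)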
+ if (changed) *changed = true; + } else { + if (changed) *changed = false; + } + return tok; + } + } +#define res_beam 1 + inline bool FindOrAddToken(StateId state_id, int32 frame, BaseFloat tot_cost, + bool emitting, bool *changed, bool pp) { + // Returns the Token pointer. Sets "changed" (if non-NULL) to true + // if the token was newly created or the cost changed. + KALDI_ASSERT(frame < active_toks_.size()); + Elem_g1 *e_found = toks_g1.Find(state_id); + if (e_found == NULL) { // no such token presently. + toks_g1.Insert(state_id, tot_cost); + return true; + } else { + if (tot_cost < e_found->val + res_beam) {// There is an existing Token for this state. + if (tot_cost < e_found->val) + e_found->val = tot_cost; + return true; + } + else if (pp) { + return false; + } + else { + return true; + } + } + } + + // prunes outgoing links for all tokens in active_toks_[frame] + // it's called by PruneActiveTokens + // all links, that have link_extra_cost > lattice_beam are pruned + void PruneForwardLinks(int32 frame, bool *extra_costs_changed, + bool *links_pruned, + BaseFloat delta) { + // delta is the amount by which the extra_costs must change + // If delta is larger, we'll tend to go back less far + // toward the beginning of the file. + // extra_costs_changed is set to true if extra_cost was changed for any token + // links_pruned is set to true if any link in any token was pruned + + *extra_costs_changed = false; + *links_pruned = false; + KALDI_ASSERT(frame >= 0 && frame < active_toks_.size()); + if (active_toks_[frame].toks == NULL ) { // empty list; should not happen. + if (!warned_) { + KALDI_WARN << "No tokens alive [doing pruning].. warning first " + "time only for each utterance\n"; + warned_ = true; + } + } + + // We have to iterate until there is no more change, because the links + // are not guaranteed to be in topological order. + bool changed = true; // difference new minus old extra cost >= delta ? + while (changed) { + changed = false; + for (Token *tok = active_toks_[frame].toks; tok != NULL; tok = tok->next) { + ForwardLink *link, *prev_link=NULL; + // will recompute tok_extra_cost for tok. + BaseFloat tok_extra_cost = std::numeric_limits::infinity(); + // tok_extra_cost is the best (min) of link_extra_cost of outgoing links + for (link = tok->links; link != NULL; ) { + // See if we need to excise this link... + Token *next_tok = link->next_tok; + BaseFloat link_extra_cost = next_tok->extra_cost + + ((tok->tot_cost + link->acoustic_cost + link->graph_cost) + - next_tok->tot_cost); // difference in brackets is >= 0 + // link_exta_cost is the difference in score between the best paths + // through link source state and through link destination state + KALDI_ASSERT(link_extra_cost == link_extra_cost); // check for NaN + if (link_extra_cost > config_.lattice_beam) { // excise link + ForwardLink *next_link = link->next; + if (prev_link != NULL) prev_link->next = next_link; + else tok->links = next_link; + delete link; + link = next_link; // advance link but leave prev_link the same. + *links_pruned = true; + } else { // keep the link and update the tok_extra_cost if needed. + if (link_extra_cost < 0.0) { // this is just a precaution. 
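+            // (extra_cost values are differences of tot_costs, so small
+            // negative values can arise from floating-point rounding;
+            // anything below -0.01 is treated as a real inconsistency.)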
+ if (link_extra_cost < -0.01) + KALDI_WARN << "Negative extra_cost: " << link_extra_cost; + link_extra_cost = 0.0; + } + if (link_extra_cost < tok_extra_cost) + tok_extra_cost = link_extra_cost; + prev_link = link; // move to next link + link = link->next; + } + } // for all outgoing links + if (fabs(tok_extra_cost - tok->extra_cost) > delta) + changed = true; // difference new minus old is bigger than delta + tok->extra_cost = tok_extra_cost; + // will be +infinity or <= lattice_beam_. + // infinity indicates, that no forward link survived pruning + } // for all Token on active_toks_[frame] + if (changed) *extra_costs_changed = true; + + // Note: it's theoretically possible that aggressive compiler + // optimizations could cause an infinite loop here for small delta and + // high-dynamic-range scores. + } // while changed + } + + // PruneForwardLinksFinal is a version of PruneForwardLinks that we call + // on the final frame. If there are final tokens active, it uses + // the final-probs for pruning, otherwise it treats all tokens as final. + void PruneForwardLinksFinal(int32 frame) { + KALDI_ASSERT(static_cast(frame+1) == active_toks_.size()); + if (active_toks_[frame].toks == NULL ) // empty list; should not happen. + KALDI_WARN << "No tokens alive at end of file\n"; + + // First go through, working out the best token (do it in parallel + // including final-probs and not including final-probs; we'll take + // the one with final-probs if it's valid). + const BaseFloat infinity = std::numeric_limits::infinity(); + BaseFloat best_cost_final = infinity, + best_cost_nofinal = infinity; + unordered_map tok_to_final_cost; + Elem *cur_toks = toks_.Clear(); // swapping prev_toks_ / cur_toks_ + DeleteElems_1(toks_g1.Clear()); + for (Elem *e = cur_toks, *e_tail; e != NULL; e = e_tail) { + PairId state_pair = e->key; + StateId state = PairToState(state_pair), + lm_state = PairToLmState(state_pair); + Token *tok = e->val; + BaseFloat final_cost = fst_.Final(state).Value() + + lm_diff_fst_->Final(lm_state).Value(); + tok_to_final_cost[tok] = final_cost; + best_cost_final = std::min(best_cost_final, tok->tot_cost + final_cost); + best_cost_nofinal = std::min(best_cost_nofinal, tok->tot_cost); + e_tail = e->tail; + toks_.Delete(e); + } + final_active_ = (best_cost_final != infinity); + + // Now go through tokens on this frame, pruning forward links... may have + // to iterate a few times until there is no more change, because the list is + // not in topological order. + + bool changed = true; + BaseFloat delta = 1.0e-05; + while (changed) { + changed = false; + for (Token *tok = active_toks_[frame].toks; tok != NULL; tok = tok->next) { + ForwardLink *link, *prev_link=NULL; + // will recompute tok_extra_cost. It has a term in it that corresponds + // to the "final-prob", so instead of initializing tok_extra_cost to infinity + // below we set it to the difference between the (score+final_prob) of this token, + // and the best such (score+final_prob). + BaseFloat tok_extra_cost; + if (final_active_) { + BaseFloat final_cost = tok_to_final_cost[tok]; + tok_extra_cost = (tok->tot_cost + final_cost) - best_cost_final; + } else + tok_extra_cost = tok->tot_cost - best_cost_nofinal; + + for (link = tok->links; link != NULL; ) { + // See if we need to excise this link... 
+ Token *next_tok = link->next_tok; + BaseFloat link_extra_cost = next_tok->extra_cost + + ((tok->tot_cost + link->acoustic_cost + link->graph_cost) + - next_tok->tot_cost); + if (link_extra_cost > config_.lattice_beam) { // excise link + ForwardLink *next_link = link->next; + if (prev_link != NULL) prev_link->next = next_link; + else tok->links = next_link; + delete link; + link = next_link; // advance link but leave prev_link the same. + } else { // keep the link and update the tok_extra_cost if needed. + if (link_extra_cost < 0.0) { // this is just a precaution. + if (link_extra_cost < -0.01) + KALDI_WARN << "Negative extra_cost: " << link_extra_cost; + link_extra_cost = 0.0; + } + if (link_extra_cost < tok_extra_cost) + tok_extra_cost = link_extra_cost; + prev_link = link; + link = link->next; + } + } + // prune away tokens worse than lattice_beam above best path. This step + // was not necessary in the non-final case because then, this case + // showed up as having no forward links. Here, the tok_extra_cost has + // an extra component relating to the final-prob. + if (tok_extra_cost > config_.lattice_beam) + tok_extra_cost = infinity; + // to be pruned in PruneTokensForFrame + + if (!ApproxEqual(tok->extra_cost, tok_extra_cost, delta)) + changed = true; + tok->extra_cost = tok_extra_cost; // will be +infinity or <= lattice_beam_. + } + } // while changed + + // Now put surviving Tokens in the final_costs_ hash, which is a class + // member (unlike tok_to_final_costs). + for (Token *tok = active_toks_[frame].toks; tok != NULL; tok = tok->next) { + if (tok->extra_cost != infinity) { + // If the token was not pruned away, + if (final_active_) { + BaseFloat final_cost = tok_to_final_cost[tok]; + if (final_cost != infinity) + final_costs_[tok] = final_cost; + } else { + final_costs_[tok] = 0; + } + } + } + } + + // Prune away any tokens on this frame that have no forward links. + // [we don't do this in PruneForwardLinks because it would give us + // a problem with dangling pointers]. + // It's called by PruneActiveTokens if any forward links have been pruned + void PruneTokensForFrame(int32 frame) { + KALDI_ASSERT(frame >= 0 && frame < active_toks_.size()); + Token *&toks = active_toks_[frame].toks; + if (toks == NULL) + KALDI_WARN << "No tokens alive [doing pruning]\n"; + Token *tok, *next_tok, *prev_tok = NULL; + for (tok = toks; tok != NULL; tok = next_tok) { + next_tok = tok->next; + if (tok->extra_cost == std::numeric_limits::infinity()) { + // token is unreachable from end of graph; (no forward links survived) + // excise tok from list and delete tok. + if (prev_tok != NULL) prev_tok->next = tok->next; + else toks = tok->next; + delete tok; + num_toks_--; + } else { // fetch next Token + prev_tok = tok; + } + } + } + + // Go backwards through still-alive tokens, pruning them. note: cur_frame is + // where hash toks_ are (so we do not want to mess with it because these tokens + // don't yet have forward pointers), but we do all previous frames, unless we + // know that we can safely ignore them because the frame after them was unchanged. + // delta controls when it considers a cost to have changed enough to continue + // going backward and propagating the change. 
+  // for a larger delta, we will recurse less far back
+  void PruneActiveTokens(int32 cur_frame, BaseFloat delta) {
+    int32 num_toks_begin = num_toks_;
+    for (int32 frame = cur_frame-1; frame >= 0; frame--) {
+      // Reasons why we need to prune forward links on this frame:
+      // (1) we have never pruned them (new TokenList), or
+      // (2) we have not yet re-pruned the forward links to the next frame
+      // after some of that frame's tokens changed their extra_cost.
+      if (active_toks_[frame].must_prune_forward_links) {
+        bool extra_costs_changed = false, links_pruned = false;
+        PruneForwardLinks(frame, &extra_costs_changed, &links_pruned, delta);
+        if (extra_costs_changed && frame > 0)  // any token has changed extra_cost
+          active_toks_[frame-1].must_prune_forward_links = true;
+        if (links_pruned)  // any link was pruned
+          active_toks_[frame].must_prune_tokens = true;
+        active_toks_[frame].must_prune_forward_links = false;  // job done
+      }
+      if (frame+1 < cur_frame &&  // except for last frame (no forward links)
+          active_toks_[frame+1].must_prune_tokens) {
+        PruneTokensForFrame(frame+1);
+        active_toks_[frame+1].must_prune_tokens = false;
+      }
+    }
+    KALDI_VLOG(3) << "PruneActiveTokens: pruned tokens from " << num_toks_begin
+                  << " to " << num_toks_;
+  }
+
+  // Version of PruneActiveTokens that we call on the final frame.
+  // Takes into account the final-prob of tokens.
+  void PruneActiveTokensFinal(int32 cur_frame) {
+    // If there are final states active we prune with respect to final-probs;
+    // otherwise all states are treated as final while doing the pruning
+    // (this can be useful if you want partial lattice output,
+    // although it can be dangerous, depending what you want the lattices for).
+    // final_active_ and final_costs_ (a hash) are set internally
+    // by PruneForwardLinksFinal.
+    int32 num_toks_begin = num_toks_;
+    PruneForwardLinksFinal(cur_frame);  // prune final frame (with final-probs);
+    // sets final_active_ and final_costs_.
+    for (int32 frame = cur_frame-1; frame >= 0; frame--) {
+      bool b1, b2;  // values not used.
+      BaseFloat dontcare = 0.0;  // delta of zero means we must always update
+      PruneForwardLinks(frame, &b1, &b2, dontcare);
+      PruneTokensForFrame(frame+1);
+    }
+    PruneTokensForFrame(0);
+    KALDI_VLOG(3) << "PruneActiveTokensFinal: pruned tokens from " << num_toks_begin
+                  << " to " << num_toks_;
+  }
+
+  /// Gets the weight cutoff.  Also counts the active tokens.
+  BaseFloat GetCutoff(Elem *list_head, size_t *tok_count,
+                      BaseFloat *adaptive_beam, Elem **best_elem) {
+    BaseFloat best_weight = std::numeric_limits<BaseFloat>::infinity();
+    // positive == high cost == bad.
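+    // What this computes, conceptually: the pruning cutoff is
+    // min(best_cost + beam, cost of the max_active'th best token), but it is
+    // never allowed to keep fewer than min_active tokens; *adaptive_beam is
+    // set to the effective beam the returned cutoff corresponds to.  As a
+    // made-up example: with beam = 10, max_active = 3 and token costs
+    // {5, 7, 8, 12, 30}, the beam cutoff would be 15, but the 4th-best cost
+    // (tmp_array_[max_active] after nth_element below) is 12, so 12 is
+    // returned and the adaptive beam becomes 12 - 5 + beam_delta.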
+    size_t count = 0;
+    if (config_.max_active == std::numeric_limits<int32>::max() &&
+        config_.min_active == 0) {
+      for (Elem *e = list_head; e != NULL; e = e->tail, count++) {
+        BaseFloat w = static_cast<BaseFloat>(e->val->tot_cost);
+        if (w < best_weight) {
+          best_weight = w;
+          if (best_elem) *best_elem = e;
+        }
+      }
+      if (tok_count != NULL) *tok_count = count;
+      if (adaptive_beam != NULL) *adaptive_beam = config_.beam;
+      return best_weight + config_.beam;
+    } else {
+      tmp_array_.clear();
+      for (Elem *e = list_head; e != NULL; e = e->tail, count++) {
+        BaseFloat w = e->val->tot_cost;
+        tmp_array_.push_back(w);
+        if (w < best_weight) {
+          best_weight = w;
+          if (best_elem) *best_elem = e;
+        }
+      }
+      if (tok_count != NULL) *tok_count = count;
+
+      BaseFloat beam_cutoff = best_weight + config_.beam,
+          min_active_cutoff = std::numeric_limits<BaseFloat>::infinity(),
+          max_active_cutoff = std::numeric_limits<BaseFloat>::infinity();
+
+      KALDI_VLOG(6) << "Number of tokens active on frame " << active_toks_.size()
+                    << " is " << tmp_array_.size();
+
+      if (tmp_array_.size() > static_cast<size_t>(config_.max_active)) {
+        std::nth_element(tmp_array_.begin(),
+                         tmp_array_.begin() + config_.max_active,
+                         tmp_array_.end());
+        max_active_cutoff = tmp_array_[config_.max_active];
+      }
+      if (max_active_cutoff < beam_cutoff) {  // max_active is tighter than beam.
+        if (adaptive_beam)
+          *adaptive_beam = max_active_cutoff - best_weight + config_.beam_delta;
+        return max_active_cutoff;
+      }
+      if (tmp_array_.size() > static_cast<size_t>(config_.min_active)) {
+        if (config_.min_active == 0) min_active_cutoff = best_weight;
+        else {
+          std::nth_element(tmp_array_.begin(),
+                           tmp_array_.begin() + config_.min_active,
+                           tmp_array_.size() > static_cast<size_t>(config_.max_active) ?
+                           tmp_array_.begin() + config_.max_active :
+                           tmp_array_.end());
+          min_active_cutoff = tmp_array_[config_.min_active];
+        }
+      }
+      if (min_active_cutoff > beam_cutoff) {  // min_active is looser than beam.
+        if (adaptive_beam)
+          *adaptive_beam = min_active_cutoff - best_weight + config_.beam_delta;
+        return min_active_cutoff;
+      } else {
+        *adaptive_beam = config_.beam;
+        return beam_cutoff;
+      }
+    }
+  }
+
+  inline StateId PropagateLm(StateId lm_state,
+                             Arc *arc, bool *pp = NULL) {  // returns new LM state.
+    if (arc->olabel == 0) {
+      if (pp) *pp = false;
+      return lm_state;  // no change in LM state if no word crossed.
+    } else {  // Propagate in the LM-diff FST.
+      if (pp) *pp = false;
+      Arc lm_arc;
+      bool ans = lm_diff_fst_->GetArc(lm_state, arc->olabel, &lm_arc);
+      if (!ans) {  // this case is unexpected for statistical LMs.
+        if (!warned_noarc_) {
+          warned_noarc_ = true;
+          KALDI_WARN << "No arc available in LM (unlikely to be correct "
+              "if a statistical language model); will not warn again";
+        }
+        arc->weight = Weight::Zero();
+        return lm_state;  // doesn't really matter what we return here; will
+        // be pruned.
+      } else {
+        arc->weight = Times(arc->weight, lm_arc.weight);
+        arc->olabel = lm_arc.olabel;  // probably will be the same.
+        return lm_arc.nextstate;  // return the new LM state.
+      }
+    }
+  }
+
+  void ProcessEmitting(DecodableInterface *decodable, int32 frame) {
+    // Processes emitting arcs for one frame.  Propagates from prev_toks_
+    // to cur_toks_.
+    Elem *last_toks = toks_.Clear();  // swapping prev_toks_ / cur_toks_
+    DeleteElems_1(toks_g1.Clear());
+    Elem *best_elem = NULL;
+    BaseFloat adaptive_beam;
+    size_t tok_cnt;
+    BaseFloat cur_cutoff = GetCutoff(last_toks, &tok_cnt, &adaptive_beam, &best_elem);
+    PossiblyResizeHash(tok_cnt);  // This makes sure the hash is always big enough.
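+    // Two cutoffs are used in this function: cur_cutoff (computed by
+    // GetCutoff above) prunes the tokens of the frame we are expanding from,
+    // while next_cutoff prunes the tokens we are about to create.
+    // next_cutoff is first estimated from the best token's outgoing arcs and
+    // then tightened as better new tokens are found, so most hopeless tokens
+    // are rejected before they are ever entered into the hash.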
+    KALDI_VLOG(6) << "Adaptive beam on frame " << frame << "\t"
+                  << active_toks_.size() << " is " << adaptive_beam
+                  << "\t" << cur_cutoff;
+
+    BaseFloat next_cutoff = std::numeric_limits<BaseFloat>::infinity();
+    // pruning "online" before having seen all tokens
+
+    // First process the best token, to get a hopefully
+    // reasonably tight bound on the next cutoff.
+    if (best_elem) {
+      PairId state_pair = best_elem->key;
+      StateId state = PairToState(state_pair),  // state in "fst"
+          lm_state = PairToLmState(state_pair);
+      Token *tok = best_elem->val;
+      for (fst::ArcIterator<fst::Fst<Arc> > aiter(fst_, state);
+           !aiter.Done();
+           aiter.Next()) {
+        Arc arc = aiter.Value();
+        if (arc.ilabel != 0) {  // propagate..
+          PropagateLm(lm_state, &arc);  // may affect "arc.weight".
+          // We don't need the return value (the new LM state).
+          arc.weight = Times(arc.weight,
+                             Weight(-decodable->LogLikelihood(frame-1, arc.ilabel)));
+          BaseFloat new_weight = arc.weight.Value() + tok->tot_cost;
+          if (new_weight + adaptive_beam < next_cutoff)
+            next_cutoff = new_weight + adaptive_beam;
+        }
+      }
+    }
+
+    // the tokens are now owned here, in last_toks, and the hash is empty.
+    // 'owned' is a complex thing here; the point is we need to call DeleteElem
+    // on each elem 'e' to let toks_ know we're done with them.
+    for (Elem *e = last_toks, *e_tail; e != NULL; e = e_tail) {
+      // loop this way because we delete "e" as we go.
+      PairId state_pair = e->key;
+      StateId state = PairToState(state_pair),
+          lm_state = PairToLmState(state_pair);
+      Token *tok = e->val;
+      if (tok->tot_cost <= cur_cutoff) {
+        for (fst::ArcIterator<fst::Fst<Arc> > aiter(fst_, state);
+             !aiter.Done();
+             aiter.Next()) {
+          const Arc &arc_ref = aiter.Value();
+          if (arc_ref.ilabel != 0) {  // propagate..
+            Arc arc(arc_ref);
+            bool pp = arc.olabel > 0;
+            BaseFloat ac_cost = -decodable->LogLikelihood(frame-1, arc.ilabel);
+            if (!FindOrAddToken(arc.nextstate, frame,
+                                tok->tot_cost + ac_cost + arc.weight.Value(),
+                                true, NULL, pp))
+              continue;
+            StateId next_lm_state = PropagateLm(lm_state, &arc, &pp);
+            BaseFloat graph_cost = arc.weight.Value(),
+                cur_cost = tok->tot_cost,
+                tot_cost = cur_cost + ac_cost + graph_cost;
+            if (tot_cost > next_cutoff) continue;
+            else if (tot_cost + config_.beam < next_cutoff)
+              next_cutoff = tot_cost + config_.beam;  // prune by best current token
+            PairId next_pair = ConstructPair(arc.nextstate, next_lm_state);
+            Token *next_tok = FindOrAddToken_2(next_pair, frame, tot_cost, true, NULL);
+            // true: emitting; NULL: no change indicator needed.
+
+            // Add ForwardLink from tok to next_tok (put on head of list tok->links)
+            tok->links = new ForwardLink(next_tok, arc.ilabel, arc.olabel,
+                                         graph_cost, ac_cost, tok->links);
+          }
+        }  // for all arcs
+      }
+      e_tail = e->tail;
+      toks_.Delete(e);  // delete Elem
+    }
+  }
+
+  void ProcessNonemitting(int32 frame) {
+    // note: "frame" is the same as that of the emitting arcs just processed.
+
+    // Processes nonemitting arcs for one frame.  Propagates within toks_.
+    // Note-- this queue structure is not very optimal, as
+    // it may cause us to process states unnecessarily (e.g. more than once),
+    // but in the baseline code, turning this vector into a set to fix this
+    // problem did not improve overall speed.
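+    // This loop is effectively a beam-pruned epsilon-closure: every
+    // surviving token goes on the queue, and whenever relaxing an epsilon
+    // (ilabel == 0) arc creates or improves a destination token, that
+    // destination is re-queued so its own epsilon successors get
+    // (re)expanded; cutoff = best_cost + config_.beam bounds the work.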
+
+    KALDI_ASSERT(queue_.empty());
+    BaseFloat best_cost = std::numeric_limits<BaseFloat>::infinity();
+    for (const Elem *e = toks_.GetList(); e != NULL; e = e->tail) {
+      queue_.push_back(e->key);
+      // for pruning with current best token
+      best_cost = std::min(best_cost, static_cast<BaseFloat>(e->val->tot_cost));
+    }
+    if (queue_.empty()) {
+      if (!warned_) {
+        KALDI_ERR << "Error in ProcessEmitting: no surviving tokens: frame is "
+                  << frame;
+        warned_ = true;
+      }
+    }
+    BaseFloat cutoff = best_cost + config_.beam;
+
+    while (!queue_.empty()) {
+      PairId state_pair = queue_.back();
+      queue_.pop_back();
+
+      Token *tok = toks_.Find(state_pair)->val;  // would segfault if state not
+      // in toks_, but this can't happen.
+      BaseFloat cur_cost = tok->tot_cost;
+      if (cur_cost > cutoff)  // Don't bother processing successors.
+        continue;
+      StateId state = PairToState(state_pair),
+          lm_state = PairToLmState(state_pair);
+      // If "tok" has any existing forward links, delete them,
+      // because we're about to regenerate them.  This is a kind
+      // of non-optimality (remember, this is the simple decoder),
+      // but since most states are emitting it's not a huge issue.
+      tok->DeleteForwardLinks();  // necessary when re-visiting
+      tok->links = NULL;
+      for (fst::ArcIterator<fst::Fst<Arc> > aiter(fst_, state);
+           !aiter.Done();
+           aiter.Next()) {
+        const Arc &arc_ref = aiter.Value();
+        if (arc_ref.ilabel == 0) {  // propagate nonemitting only...
+          Arc arc(arc_ref);
+          bool pp = arc.olabel > 0;
+          if (!FindOrAddToken(arc.nextstate, frame,
+                              tok->tot_cost + arc.weight.Value(),
+                              true, NULL, pp))
+            continue;
+          StateId next_lm_state = PropagateLm(lm_state, &arc, &pp);
+          BaseFloat graph_cost = arc.weight.Value(),
+              tot_cost = cur_cost + graph_cost;
+          if (tot_cost < cutoff) {
+            bool changed;
+            PairId next_pair = ConstructPair(arc.nextstate, next_lm_state);
+            Token *new_tok = FindOrAddToken_2(next_pair, frame, tot_cost,
+                                              false, &changed);  // false: non-emitting
+
+            tok->links = new ForwardLink(new_tok, 0, arc.olabel,
+                                         graph_cost, 0, tok->links);
+
+            // "changed" tells us whether the new token has a different
+            // cost from before, or is new [if so, add into queue].
+            if (changed) queue_.push_back(next_pair);
+          }
+        }
+      }  // for all arcs
+    }  // while queue not empty
+  }
+
+  // HashList defined in ../util/hash-list.h.  It actually allows us to
+  // maintain more than one list (e.g. for current and previous frames), but
+  // only one of them at a time can be indexed by StateId.
+  HashList<PairId, Token*> toks_;
+  HashList toks_g1;
+  std::vector<TokenList> active_toks_;  // Lists of tokens, indexed by
+  // frame (members of TokenList are toks, must_prune_forward_links,
+  // must_prune_tokens).
+  std::vector<PairId> queue_;  // temp variable used in ProcessNonemitting.
+  std::vector<BaseFloat> tmp_array_;  // used in GetCutoff;
+  // made a class member to avoid internal new/delete.
+  const fst::Fst<fst::StdArc> &fst_;
+  fst::DeterministicOnDemandFst<fst::StdArc> *lm_diff_fst_;
+  LatticeBiglmFasterDecoderConfig config_;
+  bool warned_noarc_;
+  int32 num_toks_;  // current total #toks allocated...
+  bool warned_;
+  bool final_active_;  // use this to say whether we found active final tokens
+  // on the last frame.
+  std::map<Token*, BaseFloat> final_costs_;  // A cache of final-costs
+  // of tokens on the last frame-- it's just convenient to store it this way.
+
+  // It might seem unclear why we call DeleteElems(toks_.Clear()).
+  // There are two separate cleanup tasks we need to do when we start a new
+  // file: one is to delete the Token objects in the list; the other is to
+  // delete the Elem objects.
+  // toks_.Clear() just clears them from the hash and gives ownership
+  // to the caller, who then has to call toks_.Delete(e) for each one.  It was
+  // designed this way for convenience in propagating tokens from one frame
+  // to the next.
+  void DeleteElems(Elem *list) {
+    for (Elem *e = list, *e_tail; e != NULL; e = e_tail) {
+      e_tail = e->tail;
+      toks_.Delete(e);
+    }
+    toks_.Clear();
+    DeleteElems_1(toks_g1.Clear());
+  }
+  void DeleteElems_1(Elem_g1 *list) {
+    for (Elem_g1 *e = list, *e_tail; e != NULL; e = e_tail) {
+      e_tail = e->tail;
+      toks_g1.Delete(e);
+    }
+    toks_g1.Clear();
+  }
+
+  void ClearActiveTokens() {  // a cleanup routine, at utt end/begin
+    for (size_t i = 0; i < active_toks_.size(); i++) {
+      // Delete all tokens alive on this frame, and any forward
+      // links they may have.
+      for (Token *tok = active_toks_[i].toks; tok != NULL; ) {
+        tok->DeleteForwardLinks();
+        Token *next_tok = tok->next;
+        delete tok;
+        num_toks_--;
+        tok = next_tok;
+      }
+    }
+    active_toks_.clear();
+    KALDI_ASSERT(num_toks_ == 0);
+  }
+};
+
+}  // end namespace kaldi.
+
+#endif

From 0c94f2eaa55487267cb6c91ac00a3b7154283bf8 Mon Sep 17 00:00:00 2001
From: Zhehuai Chen
Date: Sat, 14 Apr 2018 17:25:49 -0700
Subject: [PATCH 27/93] remove map in LM, comes to 1.30;
 exp_dec/fasterlm.1b2/dec.log

---
 .../lattice-otfres-biglm-faster-decoder.h |  2 +-
 src/lm/faster-arpa-lm.h                   | 92 +++++++++----------
 2 files changed, 44 insertions(+), 50 deletions(-)

diff --git a/src/decoder/lattice-otfres-biglm-faster-decoder.h b/src/decoder/lattice-otfres-biglm-faster-decoder.h
index 841547b9cca..ac682024ccb 100644
--- a/src/decoder/lattice-otfres-biglm-faster-decoder.h
+++ b/src/decoder/lattice-otfres-biglm-faster-decoder.h
@@ -355,7 +355,7 @@ class LatticeBiglmFasterDecoder {
       return tok;
     }
   }
-#define res_beam 1
+#define res_beam 0.5
   inline bool FindOrAddToken(StateId state_id, int32 frame, BaseFloat tot_cost,
                              bool emitting, bool *changed, bool pp) {
     // Returns the Token pointer.
                             // Sets "changed" (if non-NULL) to true
diff --git a/src/lm/faster-arpa-lm.h b/src/lm/faster-arpa-lm.h
index fab511c0ea5..808d56008b8 100644
--- a/src/lm/faster-arpa-lm.h
+++ b/src/lm/faster-arpa-lm.h
@@ -47,7 +47,7 @@ class FasterArpaLm {
   // LmState in FasterArpaLm: the basic storage unit
   class LmState {
    public:
-    LmState(): logprob_(0), h_value(0), next(NULL) { }
+    LmState(): logprob_(0), h_value(0), word_ids_(NULL), next(NULL) { }
     LmState(float logprob, float backoff_logprob):
       logprob_(logprob), backoff_logprob_(backoff_logprob), h_value(0),
       next(NULL) { }
@@ -60,14 +60,21 @@
       int32 sz = sizeof(int32)*(ngram_order);
     */
     }
+    void SaveWordIds(const int32 *word_ids, const int32 ngram_order) {
+      word_ids_ = (int32 *)malloc(sizeof(int32) * ngram_order);
+      for (int i = 0; i < ngram_order; i++)
+        word_ids_[i] = word_ids[i];
+    }
       if (lm_state->h_value == h_value) {
-        return lm_state;
+        ret_lm_state = lm_state;
+        break;
       }
       lm_state = lm_state->next;
     }
   }
+  if (ret_lm_state && lm_state_idx) *lm_state_idx = ret_lm_state - ngrams_;
   // not found; can be a bug, or the corresponding ngram really is absent
-  return NULL;
+  return ret_lm_state;
 }
 inline const LmState* GetHashedState(const std::vector<int32> &word_ids,
                                      bool reverse = false,
                                      int query_ngram_order = 0) const {
@@ -215,9 +227,15 @@ class FasterArpaLm {
   // if exist, get logprob_, else get backoff_logprob_
   // memcpy(n_wids+1, wids, len(wids)); n_wids[0] = cur_wrd;
+  inline void GetWordIdsByLmStateIdx(int32 **word_ids,
+                                     int32 *word_ngram_order,
+                                     int32 lm_state_idx) const {
+    *word_ids = ngrams_[lm_state_idx].word_ids_;
+    *word_ngram_order = ngrams_[lm_state_idx].ngram_order_;
+  }
+
   inline float GetNgramLogprob(const int32 *word_ids,
                                const int32 word_ngram_order,
-                               std::vector<int32>& o_word_ids) const {
+                               int32 *lm_state_idx) const {
     float prob;
     int32 ngram_order = word_ngram_order;
     assert(ngram_order > 0);
@@ -245,14 +263,8 @@
     ...
     */
     // the code below makes sure the LmState exists, so that states which do
     // not exist can be recombined into the same state
     ngram_order = std::min(ngram_order, ngram_order_-1);
-    while (!GetHashedState(word_ids, ngram_order)) ngram_order--;
+    while (!GetHashedState(word_ids, ngram_order, lm_state_idx)) ngram_order--;
     assert(ngram_order > 0);
-
-    o_word_ids.resize(ngram_order);
-    for (int i = 0; i < ngram_order; i++)
-      o_word_ids[i] = word_ids[i];
       assert(ngram_order > 1);  // thus we can do backoff
       const LmState *lm_state_bo = GetHashedState(word_ids + 1, ngram_order-1);
       ...
       //assert(lm_state_bo && lm_state_bo->IsExist());
       // TODO: assert will fail because some place has false-exist?
       // 84746 4447 8537 without 4447 8537 in LM
       prob = lm_state_bo ? lm_state_bo->backoff_logprob_ : 0;
-      prob += GetNgramLogprob(word_ids, ngram_order - 1, o_word_ids);
+      prob += GetNgramLogprob(word_ids, ngram_order - 1, lm_state_idx);
     }
     return prob;
   }
@@ -343,8 +355,6 @@
   // Size of the array, which will be needed by I/O.
   int64 lm_states_size_;
   // Hash table from word sequences to LmStates.
-  unordered_map<std::vector<int32>,
-                LmState*, VectorHasher<int32> > seq_to_state_;
   ArpaParseOptions &options_;

   // data
@@ -376,10 +386,10 @@ class FasterArpaLmDeterministicFst
   explicit FasterArpaLmDeterministicFst(const FasterArpaLm& lm):
       start_state_(0), lm_(lm) {
+    // TODO
    // Creates a history state for <s>.
-    std::vector