From 62c48b20af620ac8591ac1c7655e618dd453dbef Mon Sep 17 00:00:00 2001 From: Ke Li Date: Mon, 23 Jan 2017 15:58:57 -0500 Subject: [PATCH 1/5] sample a word version1 (IO is written by myself) --- src/rnnlm/sample_a_word.cc | 432 +++++++++++++++++++++++++++++++++++++ src/rnnlm/sample_a_word.h | 159 ++++++++++++++ 2 files changed, 591 insertions(+) create mode 100644 src/rnnlm/sample_a_word.cc create mode 100644 src/rnnlm/sample_a_word.h diff --git a/src/rnnlm/sample_a_word.cc b/src/rnnlm/sample_a_word.cc new file mode 100644 index 00000000000..8761291d525 --- /dev/null +++ b/src/rnnlm/sample_a_word.cc @@ -0,0 +1,432 @@ +// sample_a_word.cc + +#include "sample_a_word.h" +#include +#include +#include +#include +#include + +// Constructor for sampling the next word +NgramModel::NgramModel(char* arpa_file, char* histories_file) { + vocab_size_ = 0; + ReadARPAModel(arpa_file); + ReadHistories(histories_file); +} + +// Read language model from a ARPA-format file. +void NgramModel::ReadARPAModel(char* file) { + std::ifstream data_input(file); + if (!data_input.is_open()) { + std::cerr << "error opening '" << file + << "' for reading\n"; + exit(1); + } + std::string line; + int32 order; + int32 order_current = 0; + int32 word; + int32 iter = 0; + int32 while_iter = 0; + std::pair probs_pair; + float log_prob; + float backoff_weight; + bool unigram_check = false; + std::cout << "Start reading ARPA-format file..." << std::endl; + while (getline(data_input, line)) { + std::istringstream is(line); + // get the strings splitted by single space + // brace-initialization with C++11 + std::istream_iterator begin(is), end; + std::vector tokens(begin, end); + if (tokens.size() == 0) continue; + if (tokens.size() == 2 && tokens[0] == "ngram") { + std::string substring = tokens[1].substr(2); + int32 count = std::stoi(substring); // get "123456" from "1=123456" + counts_.push_back(count); + order = std::stoi(tokens[1].substr(0)); + continue; + } + if (tokens.size() == 1 && tokens[0] == "\\1-grams:") { + ngram_order_ = order; // ngram_order + probs_.resize(ngram_order_); + std::cout << "Ngram order is: " << ngram_order_ << std::endl; + } + // read current order + if (tokens.size() == 1 && tokens[0] != "\\data\\" && + tokens[0] != "\\end\\") { + order_current = std::stoi(tokens[0].substr(1,1)); + continue; // get the order info and skip processing this line + } + // read vocab and initialize probs of unigrams + if (order_current == 1) { + std::string word_s; + if (tokens.back() != "") { + word_s = tokens.end()[-2]; + backoff_weight = std::stof(tokens.back()); + } else { + word_s = tokens.back(); + backoff_weight = 0; + } + word = iter; + vocab_.insert({word_s, word}); + iter++; + vocab_size_++; + if (iter == counts_[0]) { + bool unigram_check = true; + std::cout << "vocab size: " << vocab_size_ << std::endl; + } + HistType history; + history.resize(0); + log_prob = std::stof(tokens[0]); + probs_pair = std::make_pair(log_prob, backoff_weight); + probs_[order_current - 1][history].insert({word, probs_pair}); + continue; + } + // read each ngram and its log-probs and back-off weights + // read probs of order 1 to N - 1 + if (order_current < ngram_order_ && order_current > 1) { + // case1: backoff_weights exist + if ((tokens.size() > order_current + 1) && (tokens.back() != "") && tokens[0] != "ngram") { + // get the integer for word, the last second string in tokens + std::string second_last = tokens.end()[-2]; + unordered_map::iterator it = vocab_.find(second_last); + if (it != vocab_.end()) { + word = it->second; + 
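+      // (Parsing note: each line is tokenized with a
+      // std::istream_iterator<std::string> over a std::istringstream, and the
+      // \data\ section lines of the form "ngram 2=4520910" are picked apart by
+      // substring. A hypothetical standalone helper, not part of this patch,
+      // could isolate that count-line parsing:
+      //
+      //   // Returns the count from a line such as "ngram 2=4520910",
+      //   // or -1 if the line is not a count line.
+      //   inline int ParseCountLine(const std::string &line) {
+      //     std::istringstream is(line);
+      //     std::istream_iterator<std::string> begin(is), end;
+      //     std::vector<std::string> tokens(begin, end);  // whitespace-split
+      //     if (tokens.size() != 2 || tokens[0] != "ngram") return -1;
+      //     std::size_t eq = tokens[1].find('=');
+      //     if (eq == std::string::npos) return -1;
+      //     return std::stoi(tokens[1].substr(eq + 1));  // count after '='
+      //   }
+      //
+      // Locating '=' with find() rather than a fixed substr(2) offset also
+      // stays correct for n-gram orders of 10 and above.)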
} else { + std::cout << "OOV word found: " << tokens.end()[-2] << std::endl; + } + int32 len_hist = tokens.size() - 3; // exclude the word, log-prob, and bow + HistType history; + for (int32 i = 1; i < len_hist + 1; i++) { + unordered_map::iterator it = vocab_.find(tokens[i]); + if (it != vocab_.end()) { + history.push_back(it->second); + } else { + std::cout << "OOV found in history: " << tokens[i] << std::endl; + } + } + assert (history.size() == order_current - 1); + log_prob = std::stof(tokens[0]); + backoff_weight = std::stof(tokens.back()); + probs_pair = std::make_pair(log_prob, backoff_weight); + probs_[order_current - 1][history].insert({word, probs_pair}); + continue; + } + // case2: no backoff_weights + if (tokens.size() == order_current + 1 && (tokens.back() == "") && tokens[0] != "ngram") { + unordered_map::iterator it = vocab_.find(tokens.back()); + if (it != vocab_.end()) { + word = it->second; + } + int32 len_hist = tokens.size() - 2; // exclude the word and log-prob + HistType history; + assert (len_hist > 0); + for (int32 i = 1; i < len_hist + 1; i++) { + unordered_map::iterator it = vocab_.find(tokens[i]); + if (it != vocab_.end()) { + history.push_back(it->second); + } else { + std::cout << "OOV found in history: " << tokens[i] << std::endl; + } + } + assert (history.size() == order_current - 1); + log_prob = std::stof(tokens[0]); + backoff_weight = 0; // backoff_weight in log space should be 1 (no backoff) + probs_pair = std::make_pair(log_prob, backoff_weight); + probs_[order_current - 1][history].insert({word, probs_pair}); + continue; + } + } else if (order_current == ngram_order_) { // read probs of order N + if (tokens.size() > 2) { + std::string word_s = tokens.back(); + unordered_map::iterator it = vocab_.find(word_s); + if (it != vocab_.end()) { + word = it->second; + } + int32 len_hist = tokens.size() - 2; // exclude the word and log-prob + HistType history; + assert (len_hist > 0); + for (int32 i = 1; i < len_hist + 1; i++) { + unordered_map::iterator it = vocab_.find(tokens[i]); + if (it != vocab_.end()) { + history.push_back(it->second); + } else { + std::cout << "OOV found in history: " << tokens[i] << std::endl; + } + } + log_prob = std::stof(tokens[0]); + backoff_weight = 0; // backoff_weight in log space should be 1 (no backoff) + probs_pair = std::make_pair(log_prob, backoff_weight); + probs_[order_current - 1][history].insert({word, probs_pair}); + continue; + } + } + } + std::cout << "Finish reading ARPA-format file." 
<< std::endl; +} + +float NgramModel::GetProb(int32 order, const int32 word, const HistType& history) { + float prob = 0.0; + auto it = probs_[order - 1].find(history); + if (it != probs_[order - 1].end() && + probs_[order-1][history].find(word) != probs_[order-1][history].end()) { + prob += probs_[order-1][history][word].first; + } else { // backoff to the previous order + order--; + if (order >= 1) { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + prob += GetProb(order, word, h); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob += GetBackoffWeight(order, word_new, h_new); + } + } + return prob; +} + +float NgramModel::GetBackoffWeight(int32 order, const int32 word, const HistType& history) { + float bow = 0.0; + auto it = probs_[order - 1].find(history); + if (it != probs_[order - 1].end()) { + auto it2 = probs_[order - 1][history].find(word); + if (it2 != probs_[order - 1][history].end()) { + bow = (it2->second).second; + } + } + return bow; +} + +void NgramModel::ComputeWordPdf(const HistType& history, std::vector* pdf) { + int32 order = history.size(); + float prob = 0.0; + for (int32 i = 0; i < vocab_size_; i++) { + auto it = probs_[order].find(history); + int32 word = i; + if (it != probs_[order].end()) { + auto it2 = probs_[order][history].find(word); + if (it2 != probs_[order][history].end()) { + prob = pow(10, (it2->second).first); + (*pdf).push_back(prob); + } else { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * + pow(10, GetProb(order, word, h)); + (*pdf).push_back(prob); + } + } else { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * + pow(10, GetProb(order, word, h)); + (*pdf).push_back(prob); + } + } +} + +// Get history weights +void NgramModel::ComputeHistoriesWeights() { + for (auto it = histories_.begin(); it != histories_.end(); ++it) { + HistType history(*(it)); + assert(history.size() <= ngram_order_); + for (int32 i = 0; i < history.size() + 1; i++) { + HistType h_tmp = history; + float prob = 1.0 / histories_.size(); + while (h_tmp.size() > (history.size() - i)) { + HistType::iterator last = h_tmp.end() - 1; + HistType h(h_tmp.begin(), last); + int32 word = h_tmp.back(); + prob *= pow(10, GetBackoffWeight(h_tmp.size(), word, h)); + h_tmp = h; + } + HistType::iterator begin = history.begin() + i; + HistType h(begin, history.end()); + hists_weights_[h] += prob; + } + } + std::cout << "Size of hists_weights_ is: " << hists_weights_.size() << std::endl; +} + +// Get weighted pdf +void NgramModel::ComputeWeightedPdf(std::vector* pdf_w) { + float prob = 0; + (*pdf_w).resize(vocab_size_); // if do not do this, (*pdf_w)[word] += prob will get seg fault + for (int32 i = 0; i < vocab_size_; i++) { + for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) { + HistType h(it->first); + int32 order = h.size(); + auto it_hist = 
probs_[order].find(h); + if (it_hist != probs_[order].end()) { + int32 word = i; + auto it_word = probs_[order][h].find(word); + if (it_word != probs_[order][h].end()) { + if (order > 0) { + HistType::iterator last = h.end() - 1; + HistType::iterator first = h.begin() + 1; + HistType h1(h.begin(), last); + HistType h2(first, h.end()); + prob = (it->second) * (pow(10, probs_[order][h][word].first) - + pow(10, GetBackoffWeight(order, h.back(), h1)) + * pow(10, GetProb(order, word, h2))); + (*pdf_w)[word] += prob; + } + else { + prob = (it->second) * pow(10, probs_[order][h][word].first); + (*pdf_w)[word] += prob; + } + } + } + } // end reading history + } // end reading words +} + +// sample a word that follows a pdf +int32 NgramModel::SampleWord(const std::vector& pdf) { + // generate a cdf from the given pdf + std::vector > cdf; + float upper = 0; + float lower = 0; + std::pair probs; + for (int32 i = 0; i < pdf.size(); i++) { + upper += pdf[i]; + lower = upper - pdf[i]; + probs = std::make_pair(lower, upper); + cdf.push_back(probs); + } + float u = 1.0 * rand()/RAND_MAX; + for (int32 i = 0; i < cdf.size(); i++) { + if (cdf[i].first <= u < cdf[i].second) { + return i; + } + } +} + +// Sampling a word +void NgramModel::TestSampling(int32 iters) { + ComputeHistoriesWeights(); + std::vector pdf; + ComputeWeightedPdf(&pdf); + + // Compute diff + std::vector pdf_est; + pdf_est.resize(vocab_size_); + int32 word; + int32 count_nons = 0; + for (int32 i = 0; i < iters; i++) { + word = SampleWord(pdf); + if (word > vocab_size_ || word < 0) { + std::cout << "the next word is " << word << std::endl; + count_nons += 1; + continue; + } else { + pdf_est[word] += 1.0; + } + } + // normalization + float ed = 0; + for (int32 i = 0; i < vocab_size_; i++) { + pdf_est[word] /= iters; + ed += pow(pdf_est[word] - pdf[word], 2); + } + ed = pow(ed, 0.5); + std::cout << "Run " << iters << " times, e distance (expect < 0.05) is " << ed << std::endl; + std::cout << "Number of words OOV : " << count_nons << std::endl; +} + +// Test the read-in language model +void NgramModel::TestReadingModel() { + std::cout << "Testing model reading part..."<< std::endl; + std::cout << "Vocab size is: " << vocab_size_ << std::endl; + std::cout << "Ngram_order is: " << ngram_order_ << std::endl; + assert(probs_.size() == counts_.size()); + for (int32 i = 0; i < ngram_order_; i++) { + int32 size_ngrams = 0; + std::cout << "Test: for order " << (i + 1) << std::endl; + std::cout << "Expected number of " << (i + 1) << "-grams: " << counts_[i] << std::endl; + for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) { + HistType h(it1->first); + for (auto it2 = (probs_[i])[h].begin(); it2 != (probs_[i])[h].end(); ++it2) { + size_ngrams++; // number of words given + } + } + std::cout << "Read in number of " << (i + 1) << "-grams: " << size_ngrams << std::endl; + } + std::cout << "Assert sum of unigram probs equal to 1..." << std::endl; + float prob_sum = 0.0; + int32 count = 0; + for (auto it1 = (probs_[0]).begin(); it1 != (probs_[0]).end();++it1) { + HistType h(it1->first); + for (auto it2 = (probs_[0])[h].begin(); it2 != (probs_[0])[h].end(); ++it2) { + prob_sum += 1.0 * pow(10.0, (it2->second).first); + count++; + } + } + std::cout << "Number of total words: " << count << std::endl; + std::cout << "Sum of unigram probs equal to " << prob_sum << std::endl; + + std::cout << "Assert sum of bigram probs given a history equal to 1..." 
<< std::endl; + prob_sum = 0.0; + auto it1 = probs_[1].begin(); + HistType h(it1->first); + for (auto it = vocab_.begin(); it != vocab_.end(); ++it) { + auto it2 = probs_[1][h].find(it->second); + if (it2 != probs_[1][h].end()) { + prob_sum += 1.0 * pow(10, (it2->second).first); + } else { + prob_sum += pow(10, GetProb(2, it->second, h)); + } + } + std::cout << "Sum of bigram probs given a history equal to " << prob_sum << std::endl; + +} + +// Read histories of integers from a file +void NgramModel::ReadHistories(char* file) { + std::ifstream data_input(file); + if (!data_input.is_open()) { + std::cerr << "error opening '" << file + << "' for reading\n"; + exit(1); + } + std::string line; + std::cout << "Start reading histories..." << std::endl; + while (getline(data_input, line)) { + std::istringstream is(line); + std::istream_iterator begin(is), end; + std::vector tokens(begin, end); + HistType history; + int32 word; + for (int32 i = 0; i < tokens.size(); i++) { + auto it = vocab_.find(tokens[i]); + if (it != vocab_.end()) { + word = it->second; + } else { + std::string word_s = ""; + auto it_unk = vocab_.find(word_s); + assert (it_unk != vocab_.end()); + word = it_unk->second; + } + history.push_back(word); + } + if (history.size() >= ngram_order_) { + // TODO: try slicing it later + std::reverse(history.begin(), history.end()); + history.resize(ngram_order_ - 1); + std::reverse(history.begin(), history.end()); + } + histories_.push_back(history); + } + std::cout << "Finished reading histories." << std::endl; +} diff --git a/src/rnnlm/sample_a_word.h b/src/rnnlm/sample_a_word.h new file mode 100644 index 00000000000..86fa4e1d4ee --- /dev/null +++ b/src/rnnlm/sample_a_word.h @@ -0,0 +1,159 @@ +// sample_a_word.h + +// Copyright 2016 Ke Li + +// See ../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABILITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef SAMPLE_A_WORD_H_ +#define SAMPLE_A_WORD_H_ + +#include +#include + +#ifdef _MSC_VER +#include +#include +using std::unordered_map; +using std::unordered_set; +#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) +#include +#include +using std::unordered_map; +using std::unordered_set; +#else +#include +#include +using std::tr1::unordered_map; +using std::tr1::unordered_set; +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef int32_t int32; + +/// A hashing function-object for vectors of ints. +struct IntVectorHasher { // hashing function for vector. 
+  size_t operator()(const std::vector<int32> &x) const {
+    size_t ans = 0;
+    typename std::vector<int32>::const_iterator iter = x.begin(), end = x.end();
+    for (; iter != end; ++iter) {
+      ans *= kPrime;
+      ans += *iter;
+    }
+    return ans;
+  }
+ private:
+  static const int kPrime = 7853;
+};
+
+typedef std::vector<int32> HistType;
+typedef unordered_map<int32, std::pair<float, float> > WordToProbsMap;
+typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType;
+typedef unordered_map<HistType, float, IntVectorHasher> HistWeightsType;
+
+class Timer {
+ public:
+  Timer() { Reset(); }
+
+  void Reset() { gettimeofday(&this->time_start_, &time_zone_); }
+
+  /// Returns time in seconds.
+  double Elapsed() {
+    struct timeval time_end;
+    gettimeofday(&time_end, &time_zone_);
+    double t1, t2;
+    t1 = static_cast<double>(time_start_.tv_sec) +
+        static_cast<double>(time_start_.tv_usec)/(1000*1000);
+    t2 = static_cast<double>(time_end.tv_sec) +
+        static_cast<double>(time_end.tv_usec)/(1000*1000);
+    return t2-t1;
+  }
+
+ private:
+  struct timeval time_start_;
+  struct timezone time_zone_;
+};
+
+class NgramModel {
+ public:
+  // Constructor for testing
+  NgramModel(char* arpa_file, char* histories_file);
+
+  void TestReadingModel();
+
+  void TestSampling(int32 iters);
+
+ private:
+  // This function returns the log probability of an n-gram term from the ARPA
+  // LM if it is found; it backs off to the lower-order model when the n-gram
+  // term does not exist.
+  float GetProb(int32 order, const int32 word, const HistType& history);
+
+  // Get the back-off weight of an n-gram in the read-in model
+  float GetBackoffWeight(int32 order, const int32 word, const HistType& history);
+
+  // Compute a pdf over words in the vocab given a history
+  void ComputeWordPdf(const HistType& history, std::vector<float>* pdf);
+
+  // Compute weights of given histories
+  void ComputeHistoriesWeights();
+  // Compute weighted pdf given all histories
+  void ComputeWeightedPdf(std::vector<float>* weighted_pdf);
+
+  // Sample the next word
+  int32 SampleWord(const std::vector<float>& pdf);
+
+  // Read the language model probs_ from stream.
+  // Called from constructor; checks the sum of unigrams.
+  void ReadARPAModel(char* arpa_file);
+
+  void ReadHistories(char* file);
+
+  // N-gram order of the read-in LM.
+  int32 ngram_order_;
+
+  // Counts of each n-gram order
+  std::vector<int32> counts_;
+
+  // Vocab size
+  int32 vocab_size_;
+
+  // Vocab
+  unordered_map<std::string, int32> vocab_;
+
+  // N-gram probabilities.
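+  // (Layout note: the order-k table lives at probs_[k - 1]; it is keyed
+  // first by the (k - 1)-word history and then by the predicted word, and
+  // each entry stores the pair (log10 probability, log10 backoff weight),
+  // so a bigram entry is read as
+  //   std::pair<float, float> p = probs_[1][history][word];
+  // per the typedefs above.)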
+ std::vector probs_; + + // Histories' weights + HistWeightsType hists_weights_; + + // The given N Histories + std::vector histories_; +}; + +#endif From 7ae1bff644f12e3c1bd8f8302c3a93889fc0c2e2 Mon Sep 17 00:00:00 2001 From: Ke Li Date: Tue, 7 Feb 2017 22:36:17 -0500 Subject: [PATCH 2/5] sample a word version 2 (use fst Symbol table and kaldi io) --- src/rnnlm/Makefile | 7 +- src/rnnlm/arpa-sampling.cc | 402 ++++++++++++++++++++++++++++++++++ src/rnnlm/arpa-sampling.h | 182 +++++++++++++++ src/rnnlm/rnnlm-utils-test.cc | 38 +++- 4 files changed, 622 insertions(+), 7 deletions(-) create mode 100644 src/rnnlm/arpa-sampling.cc create mode 100644 src/rnnlm/arpa-sampling.h diff --git a/src/rnnlm/Makefile b/src/rnnlm/Makefile index bd94149bdfa..48b49d61efb 100644 --- a/src/rnnlm/Makefile +++ b/src/rnnlm/Makefile @@ -10,15 +10,14 @@ TESTFILES = rnnlm-utils-test OBJFILES = rnnlm-component-itf.o rnnlm-utils.o rnnlm-nnet.o rnnlm-component.o nnet-parse.o \ rnnlm-training.o \ - rnnlm-diagnostics.o -# rnnlm-utils-test.o -# rnnlm-test-utils.o + rnnlm-diagnostics.o \ + arpa-sampling.o \ LIBNAME = kaldi-rnnlm ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ - ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ + ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../lm/kaldi-lm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ ../matrix/kaldi-matrix.a ../base/kaldi-base.a diff --git a/src/rnnlm/arpa-sampling.cc b/src/rnnlm/arpa-sampling.cc new file mode 100644 index 00000000000..f512d11fbcb --- /dev/null +++ b/src/rnnlm/arpa-sampling.cc @@ -0,0 +1,402 @@ +// arpa-sampling.cc + +#include "arpa-sampling.h" +#include +#include +#include +#include +#include + +namespace kaldi { + +void ArpaSampling::ConsumeNGram(const NGram& ngram) { + int32 cur_order = ngram.words.size(); + int32 word = ngram.words.back(); // word is the last word in vector words + HistType history(ngram.words.begin(), ngram.words.begin() + cur_order - 1); + KALDI_ASSERT(history.size() == cur_order - 1); + + BaseFloat log_prob = ngram.logprob / M_LN10; + BaseFloat backoff_weight = ngram.backoff / M_LN10; + std::pair probs_pair; + probs_pair = std::make_pair(log_prob, backoff_weight); + // update map + probs_[cur_order - 1][history].insert({word, probs_pair}); + + // get vocab_, the map from word string to integer + const fst::SymbolTable* sym = Symbols(); + if (cur_order == 1) { + num_words_++; + std::string word_s = sym->Find(word); + std::pair word_pair; + word_pair = std::make_pair(word_s, word); + vocab_.push_back(word_pair); + } +} + +void ArpaSampling::HeaderAvailable() { + ngram_counts_ = NgramCounts(); + ngram_order_ = NgramCounts().size(); + probs_.resize(ngram_order_); +} + +BaseFloat ArpaSampling::GetProb(int32 order, int32 word, const HistType& history) { + BaseFloat prob = 0.0; + auto it = probs_[order - 1].find(history); + if (it != probs_[order - 1].end() && + probs_[order-1][history].find(word) != probs_[order-1][history].end()) { + prob += probs_[order-1][history][word].first; + } else { // backoff to the previous order + order--; + if (order >= 1) { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + prob += GetProb(order, word, h); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob 
+= GetBackoffWeight(order, word_new, h_new); + } + } + return prob; +} + +BaseFloat ArpaSampling::GetBackoffWeight(int32 order, int32 word, const HistType& history) { + BaseFloat bow = 0.0; + auto it = probs_[order - 1].find(history); + if (it != probs_[order - 1].end()) { + auto it2 = probs_[order - 1][history].find(word); + if (it2 != probs_[order - 1][history].end()) { + bow = (it2->second).second; + } + } + return bow; +} + +void ArpaSampling::ComputeWordPdf(const HistType& history, std::vector >* pdf) { + int32 order = history.size(); + BaseFloat prob = 0.0; + for (int32 i = 0; i < num_words_; i++) { + auto it = probs_[order].find(history); + int32 word = vocab_[i].second; // get word from the map + if (it != probs_[order].end()) { + auto it2 = probs_[order][history].find(word); + if (it2 != probs_[order][history].end()) { + prob = pow(10, (it2->second).first); + (*pdf)[i].first = word; + (*pdf)[i].second += prob; + } else { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * + + pow(10, GetProb(order, word, h)); + (*pdf)[i].first = word; + (*pdf)[i].second += prob; + } + } else { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * + pow(10, GetProb(order, word, h)); + (*pdf)[i].first = word; + (*pdf)[i].second += prob; + } + } +} + +// Get history weights +void ArpaSampling::ComputeHistoriesWeights() { + for (auto it = histories_.begin(); it != histories_.end(); ++it) { + HistType history(*(it)); + KALDI_ASSERT(history.size() <= ngram_order_); + for (int32 i = 0; i < history.size() + 1; i++) { + HistType h_tmp = history; + BaseFloat prob = 1.0 / histories_.size(); + while (h_tmp.size() > (history.size() - i)) { + HistType::iterator last = h_tmp.end() - 1; + HistType h(h_tmp.begin(), last); + int32 word = h_tmp.back(); + prob *= pow(10, GetBackoffWeight(h_tmp.size(), word, h)); + h_tmp = h; + } + HistType::iterator begin = history.begin() + i; + HistType h(begin, history.end()); + hists_weights_[h] += prob; + } + } + KALDI_LOG << "Size of hists_weights_ is: " << hists_weights_.size(); +} + +// Get weighted pdf +void ArpaSampling::ComputeWeightedPdf(std::vector >* pdf_w) { + BaseFloat prob = 0; + (*pdf_w).resize(num_words_); // if do not do this, (*pdf_w)[word] += prob will get seg fault + for (int32 i = 0; i < num_words_; i++) { + for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) { + HistType h(it->first); + int32 order = h.size(); + auto it_hist = probs_[order].find(h); + if (it_hist != probs_[order].end()) { + int32 word = vocab_[i].second; + auto it_word = probs_[order][h].find(word); + if (it_word != probs_[order][h].end()) { + if (order > 0) { + HistType::iterator last = h.end() - 1; + HistType::iterator first = h.begin() + 1; + HistType h1(h.begin(), last); + HistType h2(first, h.end()); + prob = (it->second) * (pow(10, probs_[order][h][word].first) - + pow(10, GetBackoffWeight(order, h.back(), h1)) + * pow(10, GetProb(order, word, h2))); + (*pdf_w)[i].first = word; + (*pdf_w)[i].second += prob; + } else { + 
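+          // (In scalar form, the branch above accumulates, for each history h
+          // with weight alpha(h) and each word w whose full n-gram (h, w)
+          // exists,
+          //   alpha(h) * (p(w | h) - bow(h) * p(w | h')),
+          // where h' is h with its oldest word dropped and p and bow denote
+          // the base-10 exponentiated probability and backoff weight; the
+          // subtraction removes mass that the lower-order history bucket h'
+          // adds again for w through its own weight. This branch handles the
+          // empty history, where there is nothing to subtract.)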
prob = (it->second) * pow(10, probs_[order][h][word].first); + (*pdf_w)[i].first = word; + (*pdf_w)[i].second += prob; + } + } + } + } // end reading history + } // end reading words +} + +// sample a word that follows a pdf +int32 ArpaSampling::SampleWord(const std::vector >& pdf) { + // generate a cdf from the given pdf + std::vector > cdf; + BaseFloat upper = 0; + int32 word; + std::pair probs; + for (int32 i = 0; i < num_words_; i++) { + upper += pdf[i].second; + word = vocab_[i].second; + probs = std::make_pair(word, upper); + cdf.push_back(probs); + } + BaseFloat u = 1.0 * RandUniform(); + if (u >= 0 && u < cdf[1].second) { + return cdf[0].first; + } + for (int32 i = 1; i < num_words_; i++) { + if (cdf[i - 1].second <= u && u < cdf[i].second) { + return cdf[i].first; + } + } + return -1; +} + +// Sample a word +void ArpaSampling::TestSampling() { + ComputeHistoriesWeights(); + std::vector > pdf; + ComputeWeightedPdf(&pdf); + BaseFloat sum = 0; + for (int32 i = 0; i < num_words_; i++) { + sum += pdf[i].second; + } + + // Check convergence + unordered_map pdf_est; + int32 word; + int32 count_nons = 0; + int32 count = 0; + for (int32 i = 0; ; i++) { + word = SampleWord(pdf); + if (word > num_words_ || word < 0) { + KALDI_LOG << "the next word is " << word; + count_nons += 1; + continue; + } else { + auto it = pdf_est.find(word); + if (it == pdf_est.end()) { + pdf_est.insert({word, 1.0}); + } else { + pdf_est[word] += 1.0; + } + } + count++; + if (count % 1000 == 0) { + // normalization + BaseFloat ed = 0; + for (int32 i = 0; i < num_words_; i++) { + int32 word = vocab_[i].second; + pdf_est[word] /= count; + ed += pow(pdf_est[word] - pdf[i].second, 2); + } + ed = pow(ed, 0.5); + // KALDI_LOG << "Run " << count << " times, Euclidean distance is " << ed; + if (ed <= 0.05) { + KALDI_LOG << "Run " << count << " times, Euclidean distance (expect <= 0.05) is " << ed; + break; + } + } + } + KALDI_LOG << "Number of words OOV : " << count_nons; +} + +// this function returns the log probability of the given sentence +BaseFloat ArpaSampling::ComputeSentenceProb(const std::vector& sentence) { + BaseFloat prob = 0; + const fst::SymbolTable* sym = Symbols(); + for (int32 i = 1; i < sentence.size(); i++) { + if (i < ngram_order_ - 1) { + HistType::const_iterator last = sentence.begin() + i; + HistType h(sentence.begin(), last); + prob += GetProb(i + 1, sentence[i], h); + } else { + HistType::const_iterator first = sentence.begin() + i + 1 - ngram_order_; + HistType::const_iterator last = sentence.begin() + i; + HistType h(first, last); + KALDI_ASSERT(h.size() == ngram_order_ - 1); + prob += GetProb(ngram_order_, sentence[i], h); + } + std::string word_s = sym->Find(sentence[i]); + if (sentence[i] == kUnk) { + word_s = unk_symbol_; + } + } + return prob; +} + +// this functions computes the total log probability of all test sentences +BaseFloat ArpaSampling::ComputeAllSentencesProb(const std::vector >& sentences) { + BaseFloat prob = 0; + for (int32 i = 0; i < sentences.size(); i++) { + KALDI_ASSERT(sentences[i].size() >= 3); + prob += ComputeSentenceProb(sentences[i]); + } + int32 len = sentences.size(); + KALDI_LOG << "Total log-probabilities of " << len << " sentences are: "\ + << prob; + return prob; +} + +void ArpaSampling::PrintHist(const HistType& h) { + KALDI_LOG << "Current hist is: "; + for (int32 i = 0; i < h.size(); i++) { + KALDI_LOG << h[i] << " "; + } +} + +// Test the read-in model by computing the total prob of given sentences +void ArpaSampling::TestProbs(std::istream &is, bool 
binary) { + std::vector > sentences; + ReadSentences(is, &sentences); + ComputeAllSentencesProb(sentences); +} + +// Test the read-in language model +void ArpaSampling::TestReadingModel() { + KALDI_LOG << "Testing model reading part..."<< std::endl; + KALDI_LOG << "Vocab size is: " << vocab_.size(); + KALDI_LOG << "Ngram_order is: " << ngram_order_; + KALDI_ASSERT(probs_.size() == ngram_counts_.size()); + for (int32 i = 0; i < ngram_order_; i++) { + int32 size_ngrams = 0; + KALDI_LOG << "Test: for order " << (i + 1); + KALDI_LOG << "Expected number of " << (i + 1) << "-grams: " << ngram_counts_[i]; + for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) { + HistType h(it1->first); + for (auto it2 = (probs_[i])[h].begin(); it2 != (probs_[i])[h].end(); ++it2) { + size_ngrams++; // number of words given + } + } + KALDI_LOG << "Read in number of " << (i + 1) << "-grams: " << size_ngrams; + } + KALDI_LOG << "Assert sum of unigram probs equal to 1..."; + BaseFloat prob_sum = 0.0; + int32 count = 0; + for (auto it1 = (probs_[0]).begin(); it1 != (probs_[0]).end();++it1) { + HistType h(it1->first); + for (auto it2 = (probs_[0])[h].begin(); it2 != (probs_[0])[h].end(); ++it2) { + prob_sum += 1.0 * pow(10.0, (it2->second).first); + count++; + } + } + KALDI_LOG << "Number of total words: " << count; + KALDI_LOG << "Sum of unigram probs equal to " << prob_sum; + + KALDI_LOG << "Assert sum of bigram probs given a history equal to 1..."; + prob_sum = 0.0; + auto it1 = probs_[1].begin(); + HistType h(it1->first); + for (int32 i = 0; i < num_words_; i++) { + auto it2 = probs_[1][h].find(vocab_[i].second); + if (it2 != probs_[1][h].end()) { + prob_sum += 1.0 * pow(10, (it2->second).first); + } else { + prob_sum += pow(10, GetProb(2, vocab_[i].second, h)); + } + } + KALDI_LOG << "Sum of bigram probs given a history equal to " << prob_sum; +} + +// Read sentences from a file +void ArpaSampling::ReadSentences(std::istream &iss, std::vector >* sentences) { + const fst::SymbolTable* sym = Symbols(); + std::string line; + KALDI_LOG << "Start reading sentences..."; + while (getline(iss, line)) { + std::istringstream is(line); + std::istream_iterator begin(is), end; + std::vector tokens(begin, end); + std::vector sentence; + int32 word; + int32 bos = sym->Find(bos_symbol_); + sentence.push_back(bos); + for (int32 i = 0; i < tokens.size(); i++) { + word = sym->Find(tokens[i]); + if (word == fst::SymbolTable::kNoSymbol) { + word = sym->Find(unk_symbol_); + } + sentence.push_back(word); + } + int32 eos = sym->Find(eos_symbol_); + sentence.push_back(eos); + (*sentences).push_back(sentence); + } + KALDI_LOG << "Finished reading sentences."; +} + +// Read histories of integers from a file +void ArpaSampling::ReadHistories(std::istream &is, bool binary) { + if (binary) { + KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; + } + const fst::SymbolTable* sym = Symbols(); + std::string line; + KALDI_LOG << "Start reading histories..."; + while (getline(is, line)) { + std::istringstream is(line); + std::istream_iterator begin(is), end; + std::vector tokens(begin, end); + HistType history; + int32 word; + for (int32 i = 0; i < tokens.size(); i++) { + word = sym->Find(tokens[i]); + if (word == fst::SymbolTable::kNoSymbol) { + word = sym->Find(unk_symbol_); + } + history.push_back(word); + } + if (history.size() >= ngram_order_) { + std::reverse(history.begin(), history.end()); + history.resize(ngram_order_ - 1); + std::reverse(history.begin(), history.end()); + } + 
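+    // (The reverse/resize/reverse above just keeps the most recent
+    // ngram_order_ - 1 words; an equivalent single call would be
+    //   history.erase(history.begin(), history.end() - (ngram_order_ - 1));
+    // since only that tail of a long history is usable by the model.)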
histories_.push_back(history);
+  }
+  KALDI_LOG << "Finished reading histories.";
+}
+
+}  // end of kaldi
diff --git a/src/rnnlm/arpa-sampling.h b/src/rnnlm/arpa-sampling.h
new file mode 100644
index 00000000000..bad3b08953f
--- /dev/null
+++ b/src/rnnlm/arpa-sampling.h
@@ -0,0 +1,182 @@
+// arpa-sampling.h
+
+// Copyright 2016  Ke Li
+
+// See ../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABILITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ARPA_SAMPLING_H_
+#define ARPA_SAMPLING_H_
+
+#include
+#include
+#include "lm/arpa-file-parser.h"
+#include "fst/fstlib.h"
+
+#ifdef _MSC_VER
+#include <unordered_map>
+#include <unordered_set>
+using std::unordered_map;
+using std::unordered_set;
+#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__)
+#include <unordered_map>
+#include <unordered_set>
+using std::unordered_map;
+using std::unordered_set;
+#else
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+using std::tr1::unordered_map;
+using std::tr1::unordered_set;
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace kaldi {
+
+typedef int32_t int32;
+
+/// A hashing function-object for vectors of ints.
+struct IntVectorHasher {  // hashing function for vector<int>.
+  size_t operator()(const std::vector<int32> &x) const {
+    size_t ans = 0;
+    typename std::vector<int32>::const_iterator iter = x.begin(), end = x.end();
+    for (; iter != end; ++iter) {
+      ans *= kPrime;
+      ans += *iter;
+    }
+    return ans;
+  }
+ private:
+  static const int kPrime = 7853;
+};
+
+// Predefine some symbol values, because any integer is as good as any other.
+enum {
+  kEps = 0,
+  kDisambig,
+  kBos, kEos, kUnk
+};
+
+typedef std::vector<int32> HistType;
+typedef unordered_map<int32, std::pair<BaseFloat, BaseFloat> > WordToProbsMap;
+typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType;
+typedef unordered_map<HistType, BaseFloat, IntVectorHasher> HistWeightsType;
+
+class ArpaSampling : public ArpaFileParser {
+ public:
+  // constructor
+  explicit ArpaSampling(ArpaParseOptions options, fst::SymbolTable* symbols)
+      : ArpaFileParser(options, symbols) {
+    ngram_order_ = 0;
+    num_words_ = 0;
+    bos_symbol_ = "<s>";
+    eos_symbol_ = "</s>";
+    unk_symbol_ = "<unk>";
+  }
+  // Compute the probability of a given sentence with the ngram_order LM
+  BaseFloat ComputeSentenceProb(const std::vector<int32>& test_sentence);
+
+  // Test the read-in model by computing probs of all sentences with the
+  // ngram_order LM
+  BaseFloat ComputeAllSentencesProb(const std::vector<std::vector<int32> >& test_sentences);
+
+  void TestReadingModel();
+
+  void TestProbs(std::istream &is, bool binary);
+
+  void TestSampling();
+
+  // print a history
+  void PrintHist(const HistType& h);
+
+  void ReadHistories(std::istream &is, bool binary);
+
+  void ReadSentences(std::istream &is, std::vector<std::vector<int32> >* sentences);
+
+ protected:
+  // ArpaFileParser overrides.
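+  // (Calling convention: while Read() consumes the ARPA file,
+  // ArpaFileParser invokes HeaderAvailable() once after the \data\ counts,
+  // ConsumeNGram() once per n-gram entry (with logprob and backoff already
+  // converted to natural log, hence the division by M_LN10 in
+  // arpa-sampling.cc to return to base 10), and ReadComplete() at the end.)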
+  virtual void HeaderAvailable();
+  virtual void ConsumeNGram(const NGram& ngram);
+  virtual void ReadComplete() {}
+
+ private:
+  // This function returns the log probability of an n-gram term from the ARPA
+  // LM if it is found; it backs off to the lower-order model when the n-gram
+  // term does not exist.
+  BaseFloat GetProb(int32 order, int32 word, const HistType& history);
+
+  // Get the back-off weight of an n-gram in the read-in model
+  BaseFloat GetBackoffWeight(int32 order, int32 word, const HistType& history);
+
+  // Compute a pdf over words in the vocab given a history
+  void ComputeWordPdf(const HistType& history,
+                      std::vector<std::pair<int32, BaseFloat> >* pdf);
+
+  // Compute weights of given histories
+  void ComputeHistoriesWeights();
+
+  // Compute the weighted pdf given all histories
+  void ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >* weighted_pdf);
+
+  // Sample the next word
+  int32 SampleWord(const std::vector<std::pair<int32, BaseFloat> >& pdf);
+
+  // N-gram order of the read-in LM.
+  int32 ngram_order_;
+
+  // Number of words in the vocab
+  int32 num_words_;
+
+  // Bos symbol
+  std::string bos_symbol_;
+
+  // Eos symbol
+  std::string eos_symbol_;
+
+  // Unk symbol
+  std::string unk_symbol_;
+
+  // Vocab: pairs of (word string, integer id)
+  std::vector<std::pair<std::string, int32> > vocab_;
+
+  // Counts of each n-gram order
+  std::vector<int32> ngram_counts_;
+
+  // N-gram probabilities.
+  std::vector<NgramType> probs_;
+
+  // Histories' weights
+  HistWeightsType hists_weights_;
+
+  // The given N histories
+  std::vector<HistType> histories_;
+
+  // Test sentences
+  std::vector<std::vector<int32> > sentences_;
+};
+
+}  // end of namespace kaldi
+#endif
diff --git a/src/rnnlm/rnnlm-utils-test.cc b/src/rnnlm/rnnlm-utils-test.cc
index 793b9d3a498..5a1466b68f5 100644
--- a/src/rnnlm/rnnlm-utils-test.cc
+++ b/src/rnnlm/rnnlm-utils-test.cc
@@ -1,7 +1,13 @@
 // rnnlm/rnnlm-utils-test.cc
 
-#include
 #include "rnnlm/rnnlm-utils.h"
+#include "arpa-sampling.h"
+
+#include
+#include
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fst/fstlib.h"
 
 namespace kaldi {
 namespace rnnlm {
@@ -168,12 +174,38 @@ void UnitTestSamplingTime(int iters) {
 
 }  // end namespace rnnlm
 }  // end namespace kaldi.
 
-int main() {
+int main(int argc, char **argv) {
   using namespace kaldi;
   using namespace rnnlm;
   int N = 10000;
   UnitTestSampleWithProbOne(N);
   UnitTestSamplingTime(N);
   UnitTestSamplingConvergence();
-}
+  const char *usage = "";
+  ParseOptions po(usage);
+  po.Read(argc, argv);
+  std::string arpa_file = po.GetArg(1), history_file = po.GetArg(2);
+
+  ArpaParseOptions options;
+  fst::SymbolTable symbols;
+  // Use spaces on special symbols, so we fail rather than read them by mistake.
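+  // (The ids passed to AddSymbol below are pinned by the enum in
+  // arpa-sampling.h: kEps = 0, then kDisambig, kBos, kEos, kUnk. With
+  // oov_handling set to kAddToSymbols, any ARPA word not yet in the table is
+  // appended automatically, so the symbol table doubles as the model's
+  // vocabulary.)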
+ symbols.AddSymbol(" ", kEps); + symbols.AddSymbol(" #0", kDisambig); + options.bos_symbol = symbols.AddSymbol("", kBos); + options.eos_symbol = symbols.AddSymbol("", kEos); + options.unk_symbol = symbols.AddSymbol("", kUnk); + options.oov_handling = ArpaParseOptions::kAddToSymbols; + ArpaSampling mdl(options, &symbols); + + bool binary; + Input k1(arpa_file, &binary); + mdl.Read(k1.Stream(), binary); + mdl.TestReadingModel(); + + Input k2(history_file, &binary); + mdl.ReadHistories(k2.Stream(), binary); + + mdl.TestSampling(); + return 0; +} From 04996e8c0fe0d072742e50f13b08ea5c7724e687 Mon Sep 17 00:00:00 2001 From: Ke Li Date: Tue, 28 Mar 2017 19:19:23 -0400 Subject: [PATCH 3/5] fix a bug in computing weights of histories --- src/rnnlm/arpa-sampling.cc | 42 ++++++++++++++++++++++++++++++++++- src/rnnlm/arpa-sampling.h | 2 ++ src/rnnlm/rnnlm-utils-test.cc | 5 ++++- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/rnnlm/arpa-sampling.cc b/src/rnnlm/arpa-sampling.cc index f512d11fbcb..c771aa68edf 100644 --- a/src/rnnlm/arpa-sampling.cc +++ b/src/rnnlm/arpa-sampling.cc @@ -76,6 +76,7 @@ BaseFloat ArpaSampling::GetBackoffWeight(int32 order, int32 word, const HistType void ArpaSampling::ComputeWordPdf(const HistType& history, std::vector >* pdf) { int32 order = history.size(); BaseFloat prob = 0.0; + (*pdf).resize(num_words_); // if do not do this, (*pdf)[word] += prob will get seg fault for (int32 i = 0; i < num_words_; i++) { auto it = probs_[order].find(history); int32 word = vocab_[i].second; // get word from the map @@ -126,7 +127,8 @@ void ArpaSampling::ComputeHistoriesWeights() { HistType h(h_tmp.begin(), last); int32 word = h_tmp.back(); prob *= pow(10, GetBackoffWeight(h_tmp.size(), word, h)); - h_tmp = h; + HistType h_up(h_tmp.begin() + 1, h_tmp.end()); + h_tmp = h_up; } HistType::iterator begin = history.begin() + i; HistType h(begin, history.end()); @@ -244,6 +246,44 @@ void ArpaSampling::TestSampling() { KALDI_LOG << "Number of words OOV : " << count_nons; } +// this function check the estimated pdfs from 1) weighted history and 2) normal computation +// are the same +void ArpaSampling::TestPdfsEqual() { + // get the weighted pdf + ComputeHistoriesWeights(); + std::vector > pdf_hist_weight; + ComputeWeightedPdf(&pdf_hist_weight); + // check the averaged pdf sums to 1 + BaseFloat sum = 0; + for (int32 i = 0; i < num_words_; i++) { + sum += pdf_hist_weight[i].second; + } + KALDI_LOG << "Sum of weighted pfd: " << sum; + // get the average pdf + std::vector > pdf; + pdf.resize(num_words_); + for (int32 i = 0; i < histories_.size(); i++) { + std::vector > pdf_h; + ComputeWordPdf(histories_[i], &pdf_h); + for(int32 j = 0; j < pdf_h.size(); j++) { + pdf[j].first = pdf_h[j].first; + pdf[j].second += pdf_h[j].second / histories_.size(); + } + } + // check the averaged pdf sums to 1 + sum = 0; + for (int32 i = 0; i < num_words_; i++) { + sum += pdf[i].second; + } + KALDI_LOG << "Sum of averaged pdf: " << sum; + // check equality of the two pdfs + BaseFloat diff = 0; + for (int32 i = 0; i < num_words_; i++) { + diff += abs(pdf_hist_weight[i].second - pdf[i].second); + } + KALDI_LOG << " diff of the two pdfs: " << diff; + +} // this function returns the log probability of the given sentence BaseFloat ArpaSampling::ComputeSentenceProb(const std::vector& sentence) { BaseFloat prob = 0; diff --git a/src/rnnlm/arpa-sampling.h b/src/rnnlm/arpa-sampling.h index bad3b08953f..699dbf09d69 100644 --- a/src/rnnlm/arpa-sampling.h +++ b/src/rnnlm/arpa-sampling.h @@ -109,6 +109,8 @@ 
class ArpaSampling : public ArpaFileParser { void TestProbs(std::istream &is, bool binary); void TestSampling(); + + void TestPdfsEqual(); // print history void PrintHist(const HistType& h); diff --git a/src/rnnlm/rnnlm-utils-test.cc b/src/rnnlm/rnnlm-utils-test.cc index 5a1466b68f5..a4f2b7abd79 100644 --- a/src/rnnlm/rnnlm-utils-test.cc +++ b/src/rnnlm/rnnlm-utils-test.cc @@ -205,7 +205,10 @@ int main(int argc, char **argv) { Input k2(history_file, &binary); mdl.ReadHistories(k2.Stream(), binary); - + // command for running the test binary: ./test-binary arpa-file history-file + // arpa-file is the ARPA-format language model + // history-file has lines of histories, one history per line + mdl.TestPdfsEqual(); mdl.TestSampling(); return 0; } From 62e5f9b2c641a692eaa7b36896ed5b8e54643f4d Mon Sep 17 00:00:00 2001 From: Ke Li Date: Thu, 6 Apr 2017 02:10:00 -0400 Subject: [PATCH 4/5] Add history-weight test --- src/rnnlm/arpa-sampling.cc | 111 +++------ src/rnnlm/arpa-sampling.h | 31 +-- src/rnnlm/rnnlm-utils-test.cc | 11 +- src/rnnlm/sample_a_word.cc | 432 ---------------------------------- src/rnnlm/sample_a_word.h | 159 ------------- 5 files changed, 43 insertions(+), 701 deletions(-) delete mode 100644 src/rnnlm/sample_a_word.cc delete mode 100644 src/rnnlm/sample_a_word.h diff --git a/src/rnnlm/arpa-sampling.cc b/src/rnnlm/arpa-sampling.cc index c771aa68edf..d7be9ea7f3b 100644 --- a/src/rnnlm/arpa-sampling.cc +++ b/src/rnnlm/arpa-sampling.cc @@ -135,12 +135,12 @@ void ArpaSampling::ComputeHistoriesWeights() { hists_weights_[h] += prob; } } - KALDI_LOG << "Size of hists_weights_ is: " << hists_weights_.size(); } // Get weighted pdf void ArpaSampling::ComputeWeightedPdf(std::vector >* pdf_w) { BaseFloat prob = 0; + (*pdf_w).clear(); (*pdf_w).resize(num_words_); // if do not do this, (*pdf_w)[word] += prob will get seg fault for (int32 i = 0; i < num_words_; i++) { for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) { @@ -172,84 +172,31 @@ void ArpaSampling::ComputeWeightedPdf(std::vector >* } // end reading words } -// sample a word that follows a pdf -int32 ArpaSampling::SampleWord(const std::vector >& pdf) { - // generate a cdf from the given pdf - std::vector > cdf; - BaseFloat upper = 0; - int32 word; - std::pair probs; - for (int32 i = 0; i < num_words_; i++) { - upper += pdf[i].second; - word = vocab_[i].second; - probs = std::make_pair(word, upper); - cdf.push_back(probs); - } - BaseFloat u = 1.0 * RandUniform(); - if (u >= 0 && u < cdf[1].second) { - return cdf[0].first; - } - for (int32 i = 1; i < num_words_; i++) { - if (cdf[i - 1].second <= u && u < cdf[i].second) { - return cdf[i].first; +void ArpaSampling::RandomGenerateHistories() { + // clear previous histories + histories_.clear(); + // randomly generate histories + int32 num_histories = rand() % 1000 + 5; // generate at least 5 histories + for (int32 i = 0; i < num_histories; i++) { + HistType hist; + // size of history should be in {1, 2, ..., ngram_order_} + int32 size_hist = rand() % (ngram_order_ - 1) + 1; + KALDI_ASSERT(size_hist <= ngram_order_); + for (int32 j = 0; j < size_hist; j++) { + // word can not be zero since zero represents epsilon in the fst symbol format + int32 word = rand() % (vocab_.size() - 1) + 1; + KALDI_ASSERT(word > 0 && word <= vocab_.size()); + hist.push_back(word); } + histories_.push_back(hist); } - return -1; } -// Sample a word -void ArpaSampling::TestSampling() { - ComputeHistoriesWeights(); - std::vector > pdf; - ComputeWeightedPdf(&pdf); - BaseFloat sum = 0; - for 
(int32 i = 0; i < num_words_; i++) { - sum += pdf[i].second; - } - - // Check convergence - unordered_map pdf_est; - int32 word; - int32 count_nons = 0; - int32 count = 0; - for (int32 i = 0; ; i++) { - word = SampleWord(pdf); - if (word > num_words_ || word < 0) { - KALDI_LOG << "the next word is " << word; - count_nons += 1; - continue; - } else { - auto it = pdf_est.find(word); - if (it == pdf_est.end()) { - pdf_est.insert({word, 1.0}); - } else { - pdf_est[word] += 1.0; - } - } - count++; - if (count % 1000 == 0) { - // normalization - BaseFloat ed = 0; - for (int32 i = 0; i < num_words_; i++) { - int32 word = vocab_[i].second; - pdf_est[word] /= count; - ed += pow(pdf_est[word] - pdf[i].second, 2); - } - ed = pow(ed, 0.5); - // KALDI_LOG << "Run " << count << " times, Euclidean distance is " << ed; - if (ed <= 0.05) { - KALDI_LOG << "Run " << count << " times, Euclidean distance (expect <= 0.05) is " << ed; - break; - } - } - } - KALDI_LOG << "Number of words OOV : " << count_nons; -} - -// this function check the estimated pdfs from 1) weighted history and 2) normal computation -// are the same +// this function checks the two estimated pdfs from 1) weighted history +// and 2) normal computation are the same void ArpaSampling::TestPdfsEqual() { - // get the weighted pdf + RandomGenerateHistories(); + hists_weights_.clear(); ComputeHistoriesWeights(); std::vector > pdf_hist_weight; ComputeWeightedPdf(&pdf_hist_weight); @@ -258,7 +205,7 @@ void ArpaSampling::TestPdfsEqual() { for (int32 i = 0; i < num_words_; i++) { sum += pdf_hist_weight[i].second; } - KALDI_LOG << "Sum of weighted pfd: " << sum; + KALDI_ASSERT(ApproxEqual(sum, 1.0)); // get the average pdf std::vector > pdf; pdf.resize(num_words_); @@ -275,15 +222,15 @@ void ArpaSampling::TestPdfsEqual() { for (int32 i = 0; i < num_words_; i++) { sum += pdf[i].second; } - KALDI_LOG << "Sum of averaged pdf: " << sum; + KALDI_ASSERT(ApproxEqual(sum, 1.0)); // check equality of the two pdfs BaseFloat diff = 0; for (int32 i = 0; i < num_words_; i++) { diff += abs(pdf_hist_weight[i].second - pdf[i].second); } - KALDI_LOG << " diff of the two pdfs: " << diff; - + KALDI_ASSERT(ApproxEqual(diff, 0.0)); } + // this function returns the log probability of the given sentence BaseFloat ArpaSampling::ComputeSentenceProb(const std::vector& sentence) { BaseFloat prob = 0; @@ -339,6 +286,10 @@ void ArpaSampling::TestProbs(std::istream &is, bool binary) { void ArpaSampling::TestReadingModel() { KALDI_LOG << "Testing model reading part..."<< std::endl; KALDI_LOG << "Vocab size is: " << vocab_.size(); + std::cout << "Print out vocab: " << std::endl; + for (int i = 0; i < vocab_.size(); i++) { + std::cout << i << " , " << vocab_[i].first << " , " << vocab_[i].second << std::endl; + } KALDI_LOG << "Ngram_order is: " << ngram_order_; KALDI_ASSERT(probs_.size() == ngram_counts_.size()); for (int32 i = 0; i < ngram_order_; i++) { @@ -415,7 +366,7 @@ void ArpaSampling::ReadHistories(std::istream &is, bool binary) { } const fst::SymbolTable* sym = Symbols(); std::string line; - KALDI_LOG << "Start reading histories..."; + KALDI_LOG << "Start reading histories from file..."; while (getline(is, line)) { std::istringstream is(line); std::istream_iterator begin(is), end; @@ -436,7 +387,7 @@ void ArpaSampling::ReadHistories(std::istream &is, bool binary) { } histories_.push_back(history); } - KALDI_LOG << "Finished reading histories."; + KALDI_LOG << "Finished reading histories from file."; } } // end of kaldi diff --git a/src/rnnlm/arpa-sampling.h 
b/src/rnnlm/arpa-sampling.h index 699dbf09d69..1fdedeb573a 100644 --- a/src/rnnlm/arpa-sampling.h +++ b/src/rnnlm/arpa-sampling.h @@ -25,36 +25,15 @@ #include "lm/arpa-file-parser.h" #include "fst/fstlib.h" -#ifdef _MSC_VER -#include -#include -using std::unordered_map; -using std::unordered_set; -#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) -#include -#include -using std::unordered_map; -using std::unordered_set; -#else -#include -#include -using std::tr1::unordered_map; -using std::tr1::unordered_set; -#endif - -#include #include #include #include #include -#include #include #include #include #include #include -#include -#include namespace kaldi { @@ -78,7 +57,7 @@ struct IntVectorHasher { // hashing function for vector. // Predefine some symbol values, because any integer is as good than any other. enum { kEps = 0, - kDisambig, + // kDisambig, kBos, kEos, kUnk }; @@ -108,8 +87,6 @@ class ArpaSampling : public ArpaFileParser { void TestProbs(std::istream &is, bool binary); - void TestSampling(); - void TestPdfsEqual(); // print history @@ -133,6 +110,9 @@ class ArpaSampling : public ArpaFileParser { // Get the back-off weight of a ngram in the read-in model BaseFloat GetBackoffWeight(int32 order, int32 word, const HistType& history); + + // For test: randomly generate histories + void RandomGenerateHistories(); // Compute a pdf of words in the vocab given a history void ComputeWordPdf(const HistType& history, std::vector >* pdf); @@ -143,9 +123,6 @@ class ArpaSampling : public ArpaFileParser { // Compute weighted pdf given all histories void ComputeWeightedPdf(std::vector >* weighted_pdf); - // Sample the next word - int32 SampleWord(const std::vector >& pdf); - // N-gram order of the read-in LM. int32 ngram_order_; diff --git a/src/rnnlm/rnnlm-utils-test.cc b/src/rnnlm/rnnlm-utils-test.cc index a4f2b7abd79..0f5673b2035 100644 --- a/src/rnnlm/rnnlm-utils-test.cc +++ b/src/rnnlm/rnnlm-utils-test.cc @@ -191,7 +191,7 @@ int main(int argc, char **argv) { fst::SymbolTable symbols; // Use spaces on special symbols, so we rather fail than read them by mistake. symbols.AddSymbol(" ", kEps); - symbols.AddSymbol(" #0", kDisambig); + // symbols.AddSymbol(" #0", kDisambig); options.bos_symbol = symbols.AddSymbol("", kBos); options.eos_symbol = symbols.AddSymbol("", kEos); options.unk_symbol = symbols.AddSymbol("", kUnk); @@ -208,7 +208,12 @@ int main(int argc, char **argv) { // command for running the test binary: ./test-binary arpa-file history-file // arpa-file is the ARPA-format language model // history-file has lines of histories, one history per line - mdl.TestPdfsEqual(); - mdl.TestSampling(); + + // this test can be slow + KALDI_LOG << "Start weighted histories test..."; + for (int i = 0; i < N / 100; i++) { + mdl.TestPdfsEqual(); + } + KALDI_LOG << "Successfuly pass the test."; return 0; } diff --git a/src/rnnlm/sample_a_word.cc b/src/rnnlm/sample_a_word.cc deleted file mode 100644 index 8761291d525..00000000000 --- a/src/rnnlm/sample_a_word.cc +++ /dev/null @@ -1,432 +0,0 @@ -// sample_a_word.cc - -#include "sample_a_word.h" -#include -#include -#include -#include -#include - -// Constructor for sampling the next word -NgramModel::NgramModel(char* arpa_file, char* histories_file) { - vocab_size_ = 0; - ReadARPAModel(arpa_file); - ReadHistories(histories_file); -} - -// Read language model from a ARPA-format file. 
-void NgramModel::ReadARPAModel(char* file) { - std::ifstream data_input(file); - if (!data_input.is_open()) { - std::cerr << "error opening '" << file - << "' for reading\n"; - exit(1); - } - std::string line; - int32 order; - int32 order_current = 0; - int32 word; - int32 iter = 0; - int32 while_iter = 0; - std::pair probs_pair; - float log_prob; - float backoff_weight; - bool unigram_check = false; - std::cout << "Start reading ARPA-format file..." << std::endl; - while (getline(data_input, line)) { - std::istringstream is(line); - // get the strings splitted by single space - // brace-initialization with C++11 - std::istream_iterator begin(is), end; - std::vector tokens(begin, end); - if (tokens.size() == 0) continue; - if (tokens.size() == 2 && tokens[0] == "ngram") { - std::string substring = tokens[1].substr(2); - int32 count = std::stoi(substring); // get "123456" from "1=123456" - counts_.push_back(count); - order = std::stoi(tokens[1].substr(0)); - continue; - } - if (tokens.size() == 1 && tokens[0] == "\\1-grams:") { - ngram_order_ = order; // ngram_order - probs_.resize(ngram_order_); - std::cout << "Ngram order is: " << ngram_order_ << std::endl; - } - // read current order - if (tokens.size() == 1 && tokens[0] != "\\data\\" && - tokens[0] != "\\end\\") { - order_current = std::stoi(tokens[0].substr(1,1)); - continue; // get the order info and skip processing this line - } - // read vocab and initialize probs of unigrams - if (order_current == 1) { - std::string word_s; - if (tokens.back() != "") { - word_s = tokens.end()[-2]; - backoff_weight = std::stof(tokens.back()); - } else { - word_s = tokens.back(); - backoff_weight = 0; - } - word = iter; - vocab_.insert({word_s, word}); - iter++; - vocab_size_++; - if (iter == counts_[0]) { - bool unigram_check = true; - std::cout << "vocab size: " << vocab_size_ << std::endl; - } - HistType history; - history.resize(0); - log_prob = std::stof(tokens[0]); - probs_pair = std::make_pair(log_prob, backoff_weight); - probs_[order_current - 1][history].insert({word, probs_pair}); - continue; - } - // read each ngram and its log-probs and back-off weights - // read probs of order 1 to N - 1 - if (order_current < ngram_order_ && order_current > 1) { - // case1: backoff_weights exist - if ((tokens.size() > order_current + 1) && (tokens.back() != "") && tokens[0] != "ngram") { - // get the integer for word, the last second string in tokens - std::string second_last = tokens.end()[-2]; - unordered_map::iterator it = vocab_.find(second_last); - if (it != vocab_.end()) { - word = it->second; - } else { - std::cout << "OOV word found: " << tokens.end()[-2] << std::endl; - } - int32 len_hist = tokens.size() - 3; // exclude the word, log-prob, and bow - HistType history; - for (int32 i = 1; i < len_hist + 1; i++) { - unordered_map::iterator it = vocab_.find(tokens[i]); - if (it != vocab_.end()) { - history.push_back(it->second); - } else { - std::cout << "OOV found in history: " << tokens[i] << std::endl; - } - } - assert (history.size() == order_current - 1); - log_prob = std::stof(tokens[0]); - backoff_weight = std::stof(tokens.back()); - probs_pair = std::make_pair(log_prob, backoff_weight); - probs_[order_current - 1][history].insert({word, probs_pair}); - continue; - } - // case2: no backoff_weights - if (tokens.size() == order_current + 1 && (tokens.back() == "") && tokens[0] != "ngram") { - unordered_map::iterator it = vocab_.find(tokens.back()); - if (it != vocab_.end()) { - word = it->second; - } - int32 len_hist = tokens.size() - 2; // 
exclude the word and log-prob - HistType history; - assert (len_hist > 0); - for (int32 i = 1; i < len_hist + 1; i++) { - unordered_map::iterator it = vocab_.find(tokens[i]); - if (it != vocab_.end()) { - history.push_back(it->second); - } else { - std::cout << "OOV found in history: " << tokens[i] << std::endl; - } - } - assert (history.size() == order_current - 1); - log_prob = std::stof(tokens[0]); - backoff_weight = 0; // backoff_weight in log space should be 1 (no backoff) - probs_pair = std::make_pair(log_prob, backoff_weight); - probs_[order_current - 1][history].insert({word, probs_pair}); - continue; - } - } else if (order_current == ngram_order_) { // read probs of order N - if (tokens.size() > 2) { - std::string word_s = tokens.back(); - unordered_map::iterator it = vocab_.find(word_s); - if (it != vocab_.end()) { - word = it->second; - } - int32 len_hist = tokens.size() - 2; // exclude the word and log-prob - HistType history; - assert (len_hist > 0); - for (int32 i = 1; i < len_hist + 1; i++) { - unordered_map::iterator it = vocab_.find(tokens[i]); - if (it != vocab_.end()) { - history.push_back(it->second); - } else { - std::cout << "OOV found in history: " << tokens[i] << std::endl; - } - } - log_prob = std::stof(tokens[0]); - backoff_weight = 0; // backoff_weight in log space should be 1 (no backoff) - probs_pair = std::make_pair(log_prob, backoff_weight); - probs_[order_current - 1][history].insert({word, probs_pair}); - continue; - } - } - } - std::cout << "Finish reading ARPA-format file." << std::endl; -} - -float NgramModel::GetProb(int32 order, const int32 word, const HistType& history) { - float prob = 0.0; - auto it = probs_[order - 1].find(history); - if (it != probs_[order - 1].end() && - probs_[order-1][history].find(word) != probs_[order-1][history].end()) { - prob += probs_[order-1][history][word].first; - } else { // backoff to the previous order - order--; - if (order >= 1) { - HistType::const_iterator first = history.begin() + 1; - HistType::const_iterator last = history.end(); - HistType h(first, last); - prob += GetProb(order, word, h); - int32 word_new = history.back(); - HistType::const_iterator last_new = history.end() - 1; - HistType h_new(history.begin(), last_new); - prob += GetBackoffWeight(order, word_new, h_new); - } - } - return prob; -} - -float NgramModel::GetBackoffWeight(int32 order, const int32 word, const HistType& history) { - float bow = 0.0; - auto it = probs_[order - 1].find(history); - if (it != probs_[order - 1].end()) { - auto it2 = probs_[order - 1][history].find(word); - if (it2 != probs_[order - 1][history].end()) { - bow = (it2->second).second; - } - } - return bow; -} - -void NgramModel::ComputeWordPdf(const HistType& history, std::vector* pdf) { - int32 order = history.size(); - float prob = 0.0; - for (int32 i = 0; i < vocab_size_; i++) { - auto it = probs_[order].find(history); - int32 word = i; - if (it != probs_[order].end()) { - auto it2 = probs_[order][history].find(word); - if (it2 != probs_[order][history].end()) { - prob = pow(10, (it2->second).first); - (*pdf).push_back(prob); - } else { - HistType::const_iterator first = history.begin() + 1; - HistType::const_iterator last = history.end(); - HistType h(first, last); - int32 word_new = history.back(); - HistType::const_iterator last_new = history.end() - 1; - HistType h_new(history.begin(), last_new); - prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * - pow(10, GetProb(order, word, h)); - (*pdf).push_back(prob); - } - } else { - HistType::const_iterator 
-void NgramModel::ComputeWordPdf(const HistType& history, std::vector<float>* pdf) {
-  int32 order = history.size();
-  float prob = 0.0;
-  for (int32 i = 0; i < vocab_size_; i++) {
-    auto it = probs_[order].find(history);
-    int32 word = i;
-    if (it != probs_[order].end()) {
-      auto it2 = probs_[order][history].find(word);
-      if (it2 != probs_[order][history].end()) {
-        prob = pow(10, (it2->second).first);
-        pdf->push_back(prob);
-      } else {
-        // the word is not listed after this history: back off
-        HistType h(history.begin() + 1, history.end());
-        int32 word_new = history.back();
-        HistType h_new(history.begin(), history.end() - 1);
-        prob = pow(10, GetBackoffWeight(order, word_new, h_new)) *
-               pow(10, GetProb(order, word, h));
-        pdf->push_back(prob);
-      }
-    } else {
-      // the history itself is not listed: back off
-      HistType h(history.begin() + 1, history.end());
-      int32 word_new = history.back();
-      HistType h_new(history.begin(), history.end() - 1);
-      prob = pow(10, GetBackoffWeight(order, word_new, h_new)) *
-             pow(10, GetProb(order, word, h));
-      pdf->push_back(prob);
-    }
-  }
-}
-
-// Accumulate the weight of each history and of its back-off suffixes
-void NgramModel::ComputeHistoriesWeights() {
-  for (auto it = histories_.begin(); it != histories_.end(); ++it) {
-    HistType history(*it);
-    assert(history.size() <= ngram_order_);
-    for (int32 i = 0; i < history.size() + 1; i++) {
-      HistType h_tmp = history;
-      float prob = 1.0 / histories_.size();
-      while (h_tmp.size() > (history.size() - i)) {
-        HistType h(h_tmp.begin(), h_tmp.end() - 1);
-        int32 word = h_tmp.back();
-        prob *= pow(10, GetBackoffWeight(h_tmp.size(), word, h));
-        h_tmp = h;
-      }
-      HistType h(history.begin() + i, history.end());
-      hists_weights_[h] += prob;
-    }
-  }
-  std::cout << "Size of hists_weights_ is: " << hists_weights_.size() << std::endl;
-}
-
-// Get the weighted pdf over the vocabulary, given all histories
-void NgramModel::ComputeWeightedPdf(std::vector<float>* pdf_w) {
-  float prob = 0;
-  pdf_w->resize(vocab_size_);  // without this, (*pdf_w)[word] += prob would segfault
-  for (int32 i = 0; i < vocab_size_; i++) {
-    for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) {
-      HistType h(it->first);
-      int32 order = h.size();
-      auto it_hist = probs_[order].find(h);
-      if (it_hist != probs_[order].end()) {
-        int32 word = i;
-        auto it_word = probs_[order][h].find(word);
-        if (it_word != probs_[order][h].end()) {
-          if (order > 0) {
-            HistType h1(h.begin(), h.end() - 1);
-            HistType h2(h.begin() + 1, h.end());
-            prob = (it->second) * (pow(10, probs_[order][h][word].first) -
-                pow(10, GetBackoffWeight(order, h.back(), h1)) *
-                pow(10, GetProb(order, word, h2)));
-            (*pdf_w)[word] += prob;
-          } else {
-            prob = (it->second) * pow(10, probs_[order][h][word].first);
-            (*pdf_w)[word] += prob;
-          }
-        }
-      }
-    }  // end looping over histories
-  }  // end looping over words
-}
-
-// sample a word from the given pdf
-int32 NgramModel::SampleWord(const std::vector<float>& pdf) {
-  // generate a cdf from the given pdf
-  std::vector<std::pair<float, float> > cdf;
-  float upper = 0;
-  float lower = 0;
-  for (int32 i = 0; i < pdf.size(); i++) {
-    upper += pdf[i];
-    lower = upper - pdf[i];
-    cdf.push_back(std::make_pair(lower, upper));
-  }
-  float u = 1.0 * rand() / RAND_MAX;
-  for (int32 i = 0; i < cdf.size(); i++) {
-    if (cdf[i].first <= u && u < cdf[i].second) {  // note: 'a <= u < b' does not chain in C++
-      return i;
-    }
-  }
-  return cdf.size() - 1;  // u can be exactly 1.0, since rand() can return RAND_MAX
-}
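SampleWord draws from the pdf by inverting the cumulative distribution. Below is a minimal standalone sketch of the same idea, using <random> instead of rand() and a binary search instead of the linear scan; the function name and setup are illustrative only, not part of the patch:

    #include <algorithm>
    #include <numeric>
    #include <random>
    #include <vector>

    // Draw an index i with probability pdf[i] / sum(pdf) by inverting the CDF.
    // Assumes pdf is non-empty with positive total mass.
    int SampleFromPdf(const std::vector<float>& pdf, std::mt19937* rng) {
      std::vector<float> cdf(pdf.size());
      std::partial_sum(pdf.begin(), pdf.end(), cdf.begin());  // running sums
      std::uniform_real_distribution<float> uniform(0.0f, cdf.back());
      float u = uniform(*rng);  // u lies in [0, total mass)
      // upper_bound works because the running sums are non-decreasing;
      // it returns the first cumulative value strictly greater than u.
      return std::upper_bound(cdf.begin(), cdf.end(), u) - cdf.begin();
    }

    // usage: std::mt19937 rng(1234); int w = SampleFromPdf(pdf, &rng);

Storing a single running-sum vector also avoids the (lower, upper) pair per entry that the patch's version builds.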
-// Test sampling by drawing many words and comparing to the model pdf
-void NgramModel::TestSampling(int32 iters) {
-  ComputeHistoriesWeights();
-  std::vector<float> pdf;
-  ComputeWeightedPdf(&pdf);
-
-  // Compute the difference between the sampled and the model distributions
-  std::vector<float> pdf_est;
-  pdf_est.resize(vocab_size_);
-  int32 word;
-  int32 count_nons = 0;
-  for (int32 i = 0; i < iters; i++) {
-    word = SampleWord(pdf);
-    if (word >= vocab_size_ || word < 0) {  // >= guards the pdf_est[word] access below
-      std::cout << "the next word is " << word << std::endl;
-      count_nons += 1;
-      continue;
-    } else {
-      pdf_est[word] += 1.0;
-    }
-  }
-  // normalization
-  float ed = 0;
-  for (int32 i = 0; i < vocab_size_; i++) {
-    pdf_est[i] /= iters;  // indexed by i, not by the last sampled word
-    ed += pow(pdf_est[i] - pdf[i], 2);
-  }
-  ed = pow(ed, 0.5);
-  std::cout << "Run " << iters << " times, Euclidean distance (expect < 0.05) is "
-            << ed << std::endl;
-  std::cout << "Number of out-of-range samples: " << count_nons << std::endl;
-}
-
-// Test the read-in language model
-void NgramModel::TestReadingModel() {
-  std::cout << "Testing model reading part..." << std::endl;
-  std::cout << "Vocab size is: " << vocab_size_ << std::endl;
-  std::cout << "Ngram_order is: " << ngram_order_ << std::endl;
-  assert(probs_.size() == counts_.size());
-  for (int32 i = 0; i < ngram_order_; i++) {
-    int32 size_ngrams = 0;
-    std::cout << "Test: for order " << (i + 1) << std::endl;
-    std::cout << "Expected number of " << (i + 1) << "-grams: " << counts_[i] << std::endl;
-    for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) {
-      HistType h(it1->first);
-      for (auto it2 = probs_[i][h].begin(); it2 != probs_[i][h].end(); ++it2) {
-        size_ngrams++;  // number of words given the history
-      }
-    }
-    std::cout << "Read in number of " << (i + 1) << "-grams: " << size_ngrams << std::endl;
-  }
-  std::cout << "Assert sum of unigram probs equal to 1..." << std::endl;
-  float prob_sum = 0.0;
-  int32 count = 0;
-  for (auto it1 = probs_[0].begin(); it1 != probs_[0].end(); ++it1) {
-    HistType h(it1->first);
-    for (auto it2 = probs_[0][h].begin(); it2 != probs_[0][h].end(); ++it2) {
-      prob_sum += 1.0 * pow(10.0, (it2->second).first);
-      count++;
-    }
-  }
-  std::cout << "Number of total words: " << count << std::endl;
-  std::cout << "Sum of unigram probs equal to " << prob_sum << std::endl;
-
-  std::cout << "Assert sum of bigram probs given a history equal to 1..." << std::endl;
-  prob_sum = 0.0;
-  auto it1 = probs_[1].begin();
-  HistType h(it1->first);
-  for (auto it = vocab_.begin(); it != vocab_.end(); ++it) {
-    auto it2 = probs_[1][h].find(it->second);
-    if (it2 != probs_[1][h].end()) {
-      prob_sum += 1.0 * pow(10, (it2->second).first);
-    } else {
-      prob_sum += pow(10, GetProb(2, it->second, h));
-    }
-  }
-  std::cout << "Sum of bigram probs given a history equal to " << prob_sum << std::endl;
-}
-
-// Read histories of integers from a file
-void NgramModel::ReadHistories(char* file) {
-  std::ifstream data_input(file);
-  if (!data_input.is_open()) {
-    std::cerr << "error opening '" << file << "' for reading\n";
-    exit(1);
-  }
-  std::string line;
-  std::cout << "Start reading histories..." << std::endl;
-  while (getline(data_input, line)) {
-    std::istringstream is(line);
-    std::istream_iterator<std::string> begin(is), end;
-    std::vector<std::string> tokens(begin, end);
-    HistType history;
-    int32 word;
-    for (int32 i = 0; i < tokens.size(); i++) {
-      auto it = vocab_.find(tokens[i]);
-      if (it != vocab_.end()) {
-        word = it->second;
-      } else {
-        std::string word_s = "<unk>";
-        auto it_unk = vocab_.find(word_s);
-        assert(it_unk != vocab_.end());
-        word = it_unk->second;
-      }
-      history.push_back(word);
-    }
-    if (history.size() >= ngram_order_) {
-      // keep only the last (ngram_order_ - 1) words
-      std::reverse(history.begin(), history.end());
-      history.resize(ngram_order_ - 1);
-      std::reverse(history.begin(), history.end());
-    }
-    histories_.push_back(history);
-  }
-  std::cout << "Finished reading histories." << std::endl;
-}
diff --git a/src/rnnlm/sample_a_word.h b/src/rnnlm/sample_a_word.h
deleted file mode 100644
index 86fa4e1d4ee..00000000000
--- a/src/rnnlm/sample_a_word.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// sample_a_word.h
-
-// Copyright 2016  Ke Li
-
-// See ../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABILITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SAMPLE_A_WORD_H_
-#define SAMPLE_A_WORD_H_
-
-#include <cassert>
-#include <cstdlib>
-
-#ifdef _MSC_VER
-#include <unordered_map>
-#include <unordered_set>
-using std::unordered_map;
-using std::unordered_set;
-#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__)
-#include <unordered_map>
-#include <unordered_set>
-using std::unordered_map;
-using std::unordered_set;
-#else
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
-using std::tr1::unordered_map;
-using std::tr1::unordered_set;
-#endif
-
-#include <sys/time.h>
-#include <unistd.h>
-
-#include <algorithm>
-#include <cmath>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-typedef int32_t int32;
-
-/// A hashing function-object for vectors of ints.
-struct IntVectorHasher {  // hashing function for vector<int32>.
-  size_t operator()(const std::vector<int32> &x) const {
-    size_t ans = 0;
-    std::vector<int32>::const_iterator iter = x.begin(), end = x.end();
-    for (; iter != end; ++iter) {
-      ans *= kPrime;
-      ans += *iter;
-    }
-    return ans;
-  }
- private:
-  static const int kPrime = 7853;
-};
-
-typedef std::vector<int32> HistType;
-typedef unordered_map<int32, std::pair<float, float> > WordToProbsMap;
-typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType;
-typedef unordered_map<HistType, float, IntVectorHasher> HistWeightsType;
-
-class Timer {
- public:
-  Timer() { Reset(); }
-
-  void Reset() { gettimeofday(&this->time_start_, &time_zone_); }
-
-  /// Returns time in seconds.
-  double Elapsed() {
-    struct timeval time_end;
-    gettimeofday(&time_end, &time_zone_);
-    double t1, t2;
-    t1 = static_cast<double>(time_start_.tv_sec) +
-         static_cast<double>(time_start_.tv_usec) / (1000 * 1000);
-    t2 = static_cast<double>(time_end.tv_sec) +
-         static_cast<double>(time_end.tv_usec) / (1000 * 1000);
-    return t2 - t1;
-  }
-
- private:
-  struct timeval time_start_;
-  struct timezone time_zone_;
-};
-
-class NgramModel {
- public:
-  // Constructor for testing
-  NgramModel(char* arpa_file, char* histories_file);
-
-  void TestReadingModel();
-
-  void TestSampling(int32 iters);
-
- private:
-  // This function returns the log probability of an ngram term from the ARPA
-  // LM if it is found; it backs off to the lower-order model when the ngram
-  // term does not exist.
-  float GetProb(int32 order, const int32 word, const HistType& history);
-
-  // Get the back-off weight of an ngram in the read-in model
-  float GetBackoffWeight(int32 order, const int32 word, const HistType& history);
-
-  // Compute a pdf over words in the vocab given a history
-  void ComputeWordPdf(const HistType& history, std::vector<float>* pdf);
-
-  // Compute weights of given histories
-  void ComputeHistoriesWeights();
-
-  // Compute weighted pdf given all histories
-  void ComputeWeightedPdf(std::vector<float>* weighted_pdf);
-
-  // Sample the next word
-  int32 SampleWord(const std::vector<float>& pdf);
-
-  // Read the language model probs_ from a stream;
-  // called from the constructor; checks the sum of unigrams
-  void ReadARPAModel(char* arpa_file);
-
-  void ReadHistories(char* file);
-
-  // N-gram order of the read-in LM.
-  int32 ngram_order_;
-
-  // Counts of each ngram
-  std::vector<int32> counts_;
-
-  // Vocab size
-  int32 vocab_size_;
-
-  // Vocab
-  unordered_map<std::string, int32> vocab_;
-
-  // N-gram probabilities.
-  std::vector<NgramType> probs_;
-
-  // Histories' weights
-  HistWeightsType hists_weights_;
-
-  // The given N histories
-  std::vector<HistType> histories_;
-};
-
-#endif

From 21034856fc578e09ef88b3a0c56ffe0b4026fd72 Mon Sep 17 00:00:00 2001
From: Ke Li
Date: Fri, 28 Apr 2017 00:06:05 -0400
Subject: [PATCH 5/5] Add ComputeOutputWords function; remove auto; remove
 histories and hists_weights as class members

---
 src/rnnlm/arpa-sampling.cc    | 234 +++++++++++++++-------------------
 src/rnnlm/arpa-sampling.h     |  73 ++++-------
 src/rnnlm/rnnlm-utils-test.cc |   7 +-
 3 files changed, 133 insertions(+), 181 deletions(-)

diff --git a/src/rnnlm/arpa-sampling.cc b/src/rnnlm/arpa-sampling.cc
index d7be9ea7f3b..faf380528d0 100644
--- a/src/rnnlm/arpa-sampling.cc
+++ b/src/rnnlm/arpa-sampling.cc
@@ -9,6 +9,7 @@
 namespace kaldi {
 
+// This function reads each ngram line in the ARPA file.
 void ArpaSampling::ConsumeNGram(const NGram& ngram) {
   int32 cur_order = ngram.words.size();
   int32 word = ngram.words.back();  // word is the last word in the vector words
@@ -39,9 +40,13 @@ void ArpaSampling::HeaderAvailable() {
   probs_.resize(ngram_order_);
 }
 
+// This function returns the probability of the ngram (history, word) for the
+// given order if the history, and the word given the history, exist.
+// Otherwise it backs off to the previous order and recursively searches the
+// lower-order ngrams, down to the unigrams.
 BaseFloat ArpaSampling::GetProb(int32 order, int32 word, const HistType& history) {
   BaseFloat prob = 0.0;
-  auto it = probs_[order - 1].find(history);
+  NgramType::const_iterator it = probs_[order - 1].find(history);
   if (it != probs_[order - 1].end() &&
       probs_[order-1][history].find(word) != probs_[order-1][history].end()) {
     prob += probs_[order-1][history][word].first;
@@ -61,29 +66,31 @@ BaseFloat ArpaSampling::GetProb(int32 order, int32 word, const HistType& history) {
   return prob;
 }
 
+// This function returns the back-off weight of the ngram (history, word).
 BaseFloat ArpaSampling::GetBackoffWeight(int32 order, int32 word, const HistType& history) {
   BaseFloat bow = 0.0;
-  auto it = probs_[order - 1].find(history);
+  NgramType::const_iterator it = probs_[order - 1].find(history);
   if (it != probs_[order - 1].end()) {
-    auto it2 = probs_[order - 1][history].find(word);
+    WordToProbsMap::const_iterator it2 = probs_[order - 1][history].find(word);
     if (it2 != probs_[order - 1][history].end()) {
-      bow = (it2->second).second;
+      bow = it2->second.second;
     }
   }
   return bow;
 }
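In equation form, the rule GetProb implements is the standard ARPA back-off (all quantities in log base 10); this is the editor's restatement of the code above, not text from the patch:

    \log_{10} p(w \mid h) =
      \begin{cases}
        \mathrm{logprob}(h, w) & \text{if the n-gram } (h, w) \text{ is listed,} \\
        \mathrm{bow}(h) + \log_{10} p(w \mid h') & \text{otherwise,}
      \end{cases}

where h' drops the oldest word of h, and bow(h) is taken as 0 when h itself is not listed, as GetBackoffWeight does.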
+// This function computes the estimated pdf given a history.
 void ArpaSampling::ComputeWordPdf(const HistType& history,
     std::vector<std::pair<int32, BaseFloat> >* pdf) {
   int32 order = history.size();
   BaseFloat prob = 0.0;
-  (*pdf).resize(num_words_); // if do not do this, (*pdf)[word] += prob will get seg fault
+  (*pdf).resize(num_words_);
   for (int32 i = 0; i < num_words_; i++) {
-    auto it = probs_[order].find(history);
-    int32 word = vocab_[i].second; // get word from the map
+    NgramType::const_iterator it = probs_[order].find(history);
+    int32 word = vocab_[i].second;
     if (it != probs_[order].end()) {
-      auto it2 = probs_[order][history].find(word);
+      WordToProbsMap::const_iterator it2 = probs_[order][history].find(word);
       if (it2 != probs_[order][history].end()) {
-        prob = pow(10, (it2->second).first);
+        prob = pow(10, it2->second.first);
         (*pdf)[i].first = word;
         (*pdf)[i].second += prob;
       } else {
@@ -93,9 +100,7 @@ void ArpaSampling::ComputeWordPdf(const HistType& history,
         HistType h(history.begin() + 1, history.end());
         int32 word_new = history.back();
         HistType h_new(history.begin(), history.end() - 1);
-        prob = pow(10, GetBackoffWeight(order, word_new, h_new)) *
-               pow(10, GetProb(order, word, h));
+        prob = pow(10, GetBackoffWeight(order, word_new, h_new) +
+                       GetProb(order, word, h));
         (*pdf)[i].first = word;
         (*pdf)[i].second += prob;
       }
     }
   }
 }
 
-// Get history weights
-void ArpaSampling::ComputeHistoriesWeights() {
-  for (auto it = histories_.begin(); it != histories_.end(); ++it) {
+// Get history weights for a given list of histories
+HistWeightsType ArpaSampling::ComputeHistoriesWeights(std::vector<HistType> histories) {
+  HistWeightsType hists_weights;
+  for (std::vector<HistType>::iterator it = histories.begin(); it != histories.end();
+       ++it) {
     HistType history(*(it));
     KALDI_ASSERT(history.size() <= ngram_order_);
     for (int32 i = 0; i < history.size() + 1; i++) {
       HistType h_tmp = history;
-      BaseFloat prob = 1.0 / histories_.size();
+      BaseFloat prob = 1.0 / histories.size();
       while (h_tmp.size() > (history.size() - i)) {
         HistType::iterator last = h_tmp.end() - 1;
         HistType h(h_tmp.begin(), last);
@@ -132,37 +138,39 @@ void ArpaSampling::ComputeHistoriesWeights() {
       }
       HistType::iterator begin = history.begin() + i;
       HistType h(begin, history.end());
-      hists_weights_[h] += prob;
+      hists_weights[h] += prob;
     }
   }
+  return hists_weights;
 }
 
-// Get weighted pdf
-void ArpaSampling::ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >* pdf_w) {
+// Get weighted pdf given a list of histories
+void ArpaSampling::ComputeWeightedPdf(HistWeightsType hists_weights,
+    std::vector<std::pair<int32, BaseFloat> >* pdf_w) {
   BaseFloat prob = 0;
   (*pdf_w).clear();
-  (*pdf_w).resize(num_words_); // if do not do this, (*pdf_w)[word] += prob will get seg fault
+  (*pdf_w).resize(num_words_);
   for (int32 i = 0; i < num_words_; i++) {
-    for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) {
+    for (HistWeightsType::const_iterator it = hists_weights.begin();
+         it != hists_weights.end(); ++it) {
       HistType h(it->first);
       int32 order = h.size();
-      auto it_hist = probs_[order].find(h);
+      NgramType::const_iterator it_hist = probs_[order].find(h);
       if (it_hist != probs_[order].end()) {
         int32 word = vocab_[i].second;
-        auto it_word = probs_[order][h].find(word);
+        WordToProbsMap::const_iterator it_word = probs_[order][h].find(word);
        if (it_word != probs_[order][h].end()) {
           if (order > 0) {
             HistType::iterator last = h.end() - 1;
             HistType::iterator first = h.begin() + 1;
             HistType h1(h.begin(), last);
             HistType h2(first, h.end());
-            prob = (it->second) * (pow(10, probs_[order][h][word].first) -
-                pow(10, GetBackoffWeight(order, h.back(), h1))
-                * pow(10, GetProb(order, word, h2)));
+            prob = it->second * (pow(10, probs_[order][h][word].first) -
+                pow(10, GetBackoffWeight(order, h.back(), h1) + GetProb(order, word, h2)));
             (*pdf_w)[i].first = word;
             (*pdf_w)[i].second += prob;
           } else {
-            prob = (it->second) * pow(10, probs_[order][h][word].first);
+            prob = it->second * pow(10, probs_[order][h][word].first);
             (*pdf_w)[i].first = word;
             (*pdf_w)[i].second += prob;
           }
         }
       }
     }  // end reading history
   }  // end reading words
 }
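As far as the editor can tell, the subtraction in ComputeWeightedPdf (and in ComputeOutputWords below) realizes the following average over the supplied history set: ComputeHistoriesWeights already pushes each history's weight down to its back-off suffixes, scaled by 10^bow at each step, so the explicitly listed mass must have its back-off portion removed to avoid double counting,

    \bar{p}(w) = \sum_{h} \alpha(h) \left( p_{\mathrm{listed}}(w \mid h)
                 - 10^{\mathrm{bow}(h)}\, p(w \mid h') \right),

where \alpha(h) is the accumulated weight of history h and the empty (unigram) history contributes its mass with nothing subtracted. This reading is the editor's, not stated in the patch.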
@@ -172,10 +180,42 @@ void ArpaSampling::ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >
 
-void ArpaSampling::RandomGenerateHistories() {
-  // clear previous histories
-  histories_.clear();
-  // randomly generate histories
+// This function computes the words observed after the given histories,
+// together with their corresponding probabilities.
+void ArpaSampling::ComputeOutputWords(std::vector<HistType> histories,
+    unordered_map<int32, BaseFloat>* pdf_w) {
+  HistWeightsType hists_weights = ComputeHistoriesWeights(histories);
+  BaseFloat prob = 0;
+  for (HistWeightsType::const_iterator it = hists_weights.begin();
+       it != hists_weights.end(); ++it) {
+    HistType h(it->first);
+    int32 order = h.size();
+    NgramType::const_iterator it_hist = probs_[order].find(h);
+    if (it_hist != probs_[order].end()) {
+      for (WordToProbsMap::const_iterator it_word = probs_[order][h].begin();
+           it_word != probs_[order][h].end(); ++it_word) {
+        int32 word = it_word->first;
+        if (order > 0) {
+          HistType::iterator last = h.end() - 1;
+          HistType::iterator first = h.begin() + 1;
+          HistType h1(h.begin(), last);
+          HistType h2(first, h.end());
+          prob = it->second * (pow(10, probs_[order][h][word].first) -
+              pow(10, GetBackoffWeight(order, h.back(), h1) + GetProb(order, word, h2)));
+          unordered_map<int32, BaseFloat>::iterator map_it = (*pdf_w).find(word);
+          if (map_it != (*pdf_w).end()) {
+            (*pdf_w)[word] += prob;
+          } else {
+            (*pdf_w).insert({word, prob});
+          }
+        }
+      }
+    }
+  }
+}
+
+// This function randomly generates at least 5 (and up to 1004) histories.
+std::vector<HistType> ArpaSampling::RandomGenerateHistories() {
+  std::vector<HistType> histories;
   int32 num_histories = rand() % 1000 + 5;  // generate at least 5 histories
   for (int32 i = 0; i < num_histories; i++) {
     HistType hist;
@@ -188,18 +228,20 @@ std::vector<HistType> ArpaSampling::RandomGenerateHistories() {
       KALDI_ASSERT(word > 0 && word <= vocab_.size());
       hist.push_back(word);
     }
-    histories_.push_back(hist);
+    histories.push_back(hist);
   }
+  return histories;
 }
 
 // this function checks that the two pdfs estimated from 1) the weighted
 // histories and 2) the direct computation are the same
 void ArpaSampling::TestPdfsEqual() {
-  RandomGenerateHistories();
-  hists_weights_.clear();
-  ComputeHistoriesWeights();
+  std::vector<HistType> histories;
+  histories = RandomGenerateHistories();
+  HistWeightsType hists_weights;
+  hists_weights = ComputeHistoriesWeights(histories);
   std::vector<std::pair<int32, BaseFloat> > pdf_hist_weight;
-  ComputeWeightedPdf(&pdf_hist_weight);
+  ComputeWeightedPdf(hists_weights, &pdf_hist_weight);
   // check the weighted pdf sums to 1
   BaseFloat sum = 0;
   for (int32 i = 0; i < num_words_; i++) {
     sum += pdf_hist_weight[i].second;
   }
@@ -209,12 +251,12 @@ void ArpaSampling::TestPdfsEqual() {
   // get the average pdf
   std::vector<std::pair<int32, BaseFloat> > pdf;
   pdf.resize(num_words_);
-  for (int32 i = 0; i < histories_.size(); i++) {
+  for (int32 i = 0; i < histories.size(); i++) {
     std::vector<std::pair<int32, BaseFloat> > pdf_h;
-    ComputeWordPdf(histories_[i], &pdf_h);
+    ComputeWordPdf(histories[i], &pdf_h);
     for (int32 j = 0; j < pdf_h.size(); j++) {
       pdf[j].first = pdf_h[j].first;
-      pdf[j].second += pdf_h[j].second / histories_.size();
+      pdf[j].second += pdf_h[j].second / histories.size();
     }
   }
   // check the averaged pdf sums to 1
@@ -231,74 +273,19 @@ void ArpaSampling::TestPdfsEqual() {
   KALDI_ASSERT(ApproxEqual(diff, 0.0));
 }
 
-// this function returns the log probability of the given sentence
-BaseFloat ArpaSampling::ComputeSentenceProb(const std::vector<int32>& sentence) {
-  BaseFloat prob = 0;
-  const fst::SymbolTable* sym = Symbols();
-  for (int32 i = 1; i < sentence.size(); i++) {
-    if (i < ngram_order_ - 1) {
-      HistType::const_iterator last = sentence.begin() + i;
-      HistType h(sentence.begin(), last);
-      prob += GetProb(i + 1, sentence[i], h);
-    } else {
-      HistType::const_iterator first = sentence.begin() + i + 1 - ngram_order_;
-      HistType::const_iterator last = sentence.begin() + i;
-      HistType h(first, last);
-      KALDI_ASSERT(h.size() == ngram_order_ - 1);
-      prob += GetProb(ngram_order_, sentence[i], h);
-    }
-    std::string word_s = sym->Find(sentence[i]);
-    if (sentence[i] == kUnk) {
-      word_s = unk_symbol_;
-    }
-  }
-  return prob;
-}
-
-// this function computes the total log probability of all test sentences
-BaseFloat ArpaSampling::ComputeAllSentencesProb(const std::vector<std::vector<int32> >& sentences) {
-  BaseFloat prob = 0;
-  for (int32 i = 0; i < sentences.size(); i++) {
-    KALDI_ASSERT(sentences[i].size() >= 3);
-    prob += ComputeSentenceProb(sentences[i]);
-  }
-  int32 len = sentences.size();
-  KALDI_LOG << "Total log-probabilities of " << len << " sentences are: " << prob;
-  return prob;
-}
-
-void ArpaSampling::PrintHist(const HistType& h) {
-  KALDI_LOG << "Current hist is: ";
-  for (int32 i = 0; i < h.size(); i++) {
-    KALDI_LOG << h[i] << " ";
-  }
-}
-
-// Test the read-in model by computing the total prob of given sentences
-void ArpaSampling::TestProbs(std::istream &is, bool binary) {
-  std::vector<std::vector<int32> > sentences;
-  ReadSentences(is, &sentences);
-  ComputeAllSentencesProb(sentences);
-}
-
 // Test the read-in language model
 void ArpaSampling::TestReadingModel() {
   KALDI_LOG << "Testing model reading part..." << std::endl;
   KALDI_LOG << "Vocab size is: " << vocab_.size();
-  std::cout << "Print out vocab: " << std::endl;
-  for (int i = 0; i < vocab_.size(); i++) {
-    std::cout << i << " , " << vocab_[i].first << " , " << vocab_[i].second << std::endl;
-  }
   KALDI_LOG << "Ngram_order is: " << ngram_order_;
   KALDI_ASSERT(probs_.size() == ngram_counts_.size());
   for (int32 i = 0; i < ngram_order_; i++) {
     int32 size_ngrams = 0;
     KALDI_LOG << "Test: for order " << (i + 1);
     KALDI_LOG << "Expected number of " << (i + 1) << "-grams: " << ngram_counts_[i];
-    for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) {
+    for (NgramType::const_iterator it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) {
       HistType h(it1->first);
-      for (auto it2 = (probs_[i])[h].begin(); it2 != (probs_[i])[h].end(); ++it2) {
+      for (WordToProbsMap::const_iterator it2 = probs_[i][h].begin(); it2 != probs_[i][h].end(); ++it2) {
         size_ngrams++;  // number of words given the history
       }
     }
@@ -307,10 +294,10 @@ void ArpaSampling::TestReadingModel() {
   KALDI_LOG << "Assert sum of unigram probs equal to 1...";
   BaseFloat prob_sum = 0.0;
   int32 count = 0;
-  for (auto it1 = (probs_[0]).begin(); it1 != (probs_[0]).end();++it1) {
+  for (NgramType::const_iterator it1 = probs_[0].begin(); it1 != probs_[0].end(); ++it1) {
     HistType h(it1->first);
-    for (auto it2 = (probs_[0])[h].begin(); it2 != (probs_[0])[h].end(); ++it2) {
-      prob_sum += 1.0 * pow(10.0, (it2->second).first);
+    for (WordToProbsMap::const_iterator it2 = probs_[0][h].begin(); it2 != probs_[0][h].end(); ++it2) {
+      prob_sum += 1.0 * pow(10.0, it2->second.first);
       count++;
     }
   }
@@ -319,12 +306,12 @@ void ArpaSampling::TestReadingModel() {
 
   KALDI_LOG << "Assert sum of bigram probs given a history equal to 1...";
   prob_sum = 0.0;
-  auto it1 = probs_[1].begin();
+  NgramType::const_iterator it1 = probs_[1].begin();
   HistType h(it1->first);
   for (int32 i = 0; i < num_words_; i++) {
-    auto it2 = probs_[1][h].find(vocab_[i].second);
+    WordToProbsMap::const_iterator it2 = probs_[1][h].find(vocab_[i].second);
     if (it2 != probs_[1][h].end()) {
-      prob_sum += 1.0 * pow(10, (it2->second).first);
+      prob_sum += 1.0 * pow(10, it2->second.first);
     } else {
       prob_sum += pow(10, GetProb(2, vocab_[i].second, h));
     }
   }
@@ -332,39 +319,17 @@ void ArpaSampling::TestReadingModel() {
   KALDI_LOG << "Sum of bigram probs given a history equal to " << prob_sum;
 }
 
-// Read sentences from a file
-void ArpaSampling::ReadSentences(std::istream &iss, std::vector<std::vector<int32> >* sentences) {
-  const fst::SymbolTable* sym = Symbols();
-  std::string line;
-  KALDI_LOG << "Start reading sentences...";
-  while (getline(iss, line)) {
-    std::istringstream is(line);
-    std::istream_iterator<std::string> begin(is), end;
-    std::vector<std::string> tokens(begin, end);
-    std::vector<int32> sentence;
-    int32 word;
-    int32 bos = sym->Find(bos_symbol_);
-    sentence.push_back(bos);
-    for (int32 i = 0; i < tokens.size(); i++) {
-      word = sym->Find(tokens[i]);
-      if (word == fst::SymbolTable::kNoSymbol) {
-        word = sym->Find(unk_symbol_);
-      }
-      sentence.push_back(word);
-    }
-    int32 eos = sym->Find(eos_symbol_);
-    sentence.push_back(eos);
-    (*sentences).push_back(sentence);
-  }
-  KALDI_LOG << "Finished reading sentences.";
+int32 ArpaSampling::GetNgramOrder() {
+  return ngram_order_;
 }
 
 // Read histories of integers from a file
-void ArpaSampling::ReadHistories(std::istream &is, bool binary) {
+std::vector<HistType> ArpaSampling::ReadHistories(std::istream &is, bool binary) {
   if (binary) {
     KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser";
   }
   const fst::SymbolTable* sym = Symbols();
+  std::vector<HistType> histories;
   std::string line;
   KALDI_LOG << "Start reading histories from file...";
   while (getline(is, line)) {
@@ -381,13 +346,14 @@ std::vector<HistType> ArpaSampling::ReadHistories(std::istream &is, bool binary) {
       history.push_back(word);
     }
     if (history.size() >= ngram_order_) {
-      std::reverse(history.begin(), history.end());
-      history.resize(ngram_order_ - 1);
-      std::reverse(history.begin(), history.end());
+      // keep only the most recent (ngram_order_ - 1) words
+      history.erase(history.begin(), history.end() - ngram_order_ + 1);
     }
-    histories_.push_back(history);
+    histories.push_back(history);
   }
   KALDI_LOG << "Finished reading histories from file.";
+  return histories;
 }
 
 }  // end of kaldi
diff --git a/src/rnnlm/arpa-sampling.h b/src/rnnlm/arpa-sampling.h
index 1fdedeb573a..5f80ca308a5 100644
--- a/src/rnnlm/arpa-sampling.h
+++ b/src/rnnlm/arpa-sampling.h
@@ -24,6 +24,7 @@
 #include <string>
 #include "lm/arpa-file-parser.h"
 #include "fst/fstlib.h"
+#include "util/common-utils.h"
 #include <unordered_map>
 #include <vector>
@@ -39,32 +40,16 @@
 namespace kaldi {
 
 typedef int32_t int32;
 
-/// A hashing function-object for vectors of ints.
-struct IntVectorHasher {  // hashing function for vector<int32>.
-  size_t operator()(const std::vector<int32> &x) const {
-    size_t ans = 0;
-    std::vector<int32>::const_iterator iter = x.begin(), end = x.end();
-    for (; iter != end; ++iter) {
-      ans *= kPrime;
-      ans += *iter;
-    }
-    return ans;
-  }
- private:
-  static const int kPrime = 7853;
-};
-
-// Predefine some symbol values, because any integer is as good as any other.
 enum {
   kEps = 0,
-  // kDisambig,
+  kDisambig,
   kBos,
   kEos,
   kUnk
 };
 
 typedef std::vector<int32> HistType;
 typedef unordered_map<int32, std::pair<BaseFloat, BaseFloat> > WordToProbsMap;
-typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType;
-typedef unordered_map<HistType, BaseFloat, IntVectorHasher> HistWeightsType;
+typedef unordered_map<HistType, WordToProbsMap, VectorHasher<int32> > NgramType;
+typedef unordered_map<HistType, BaseFloat, VectorHasher<int32> > HistWeightsType;
 
 class ArpaSampling : public ArpaFileParser {
  public:
@@ -77,25 +62,34 @@ class ArpaSampling : public ArpaFileParser {
     eos_symbol_ = "</s>";
     unk_symbol_ = "<unk>";
   }
-  // Compute the probability of a given sentence with the ngram_order LM
-  BaseFloat ComputeSentenceProb(const std::vector<int32>& test_sentence);
-
-  // Test the read-in model by computing probs of all sentences with the ngram_order LM
-  BaseFloat ComputeAllSentencesProb(const std::vector<std::vector<int32> >& test_sentences);
+  // This function returns the log probability of an ngram term from the ARPA
+  // LM if it is found; it backs off to the lower-order model when the ngram
+  // term does not exist.
+  BaseFloat GetProb(int32 order, int32 word, const HistType& history);
+
+  // Get the back-off weight of an ngram in the read-in model
+  BaseFloat GetBackoffWeight(int32 order, int32 word, const HistType& history);
+
+  // Compute non-unigram output words and corresponding probs for given histories
+  void ComputeOutputWords(std::vector<HistType> histories,
+      unordered_map<int32, BaseFloat>* pdf_w);
+
+  // Compute weighted pdf given all histories
+  void ComputeWeightedPdf(HistWeightsType hists_weights,
+      std::vector<std::pair<int32, BaseFloat> >* weighted_pdf);
+
+  // Get ngram order
+  int32 GetNgramOrder();
+
   void TestReadingModel();
 
   void TestProbs(std::istream &is, bool binary);
 
   void TestPdfsEqual();
 
-  // print history
-  void PrintHist(const HistType& h);
-
-  void ReadHistories(std::istream &is, bool binary);
+  std::vector<HistType> ReadHistories(std::istream &is, bool binary);
 
-  void ReadSentences(std::istream &is, std::vector<std::vector<int32> >* sentences);
-
 protected:
   // ArpaFileParser overrides.
   virtual void HeaderAvailable();
@@ -103,26 +97,16 @@ class ArpaSampling : public ArpaFileParser {
   virtual void ConsumeNGram(const NGram& ngram);
   virtual void ReadComplete() {}
 
  private:
-  // This function returns the log probability of an ngram term from the ARPA
-  // LM if it is found; it backs off to the lower-order model when the ngram
-  // term does not exist.
-  BaseFloat GetProb(int32 order, int32 word, const HistType& history);
-
-  // Get the back-off weight of an ngram in the read-in model
-  BaseFloat GetBackoffWeight(int32 order, int32 word, const HistType& history);
-
   // For test: randomly generate histories
-  void RandomGenerateHistories();
+  std::vector<HistType> RandomGenerateHistories();
 
   // Compute a pdf of words in the vocab given a history
-  void ComputeWordPdf(const HistType& history, std::vector<std::pair<int32, BaseFloat> >* pdf);
+  void ComputeWordPdf(const HistType& history,
+      std::vector<std::pair<int32, BaseFloat> >* pdf);
 
   // Compute weights of given histories
-  void ComputeHistoriesWeights();
+  HistWeightsType ComputeHistoriesWeights(std::vector<HistType> histories);
 
-  // Compute weighted pdf given all histories
-  void ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >* weighted_pdf);
-
   // N-gram order of the read-in LM.
   int32 ngram_order_;
@@ -150,9 +134,6 @@ class ArpaSampling : public ArpaFileParser {
 
   // Histories' weights
   HistWeightsType hists_weights_;
 
-  // The given N Histories
-  std::vector<HistType> histories_;
-
   // Test sentences
   std::vector<std::vector<int32> > sentences_;
 };
diff --git a/src/rnnlm/rnnlm-utils-test.cc b/src/rnnlm/rnnlm-utils-test.cc
index 0f5673b2035..b081d878512 100644
--- a/src/rnnlm/rnnlm-utils-test.cc
+++ b/src/rnnlm/rnnlm-utils-test.cc
@@ -204,16 +204,21 @@ int main(int argc, char **argv) {
   mdl.TestReadingModel();
 
   Input k2(history_file, &binary);
-  mdl.ReadHistories(k2.Stream(), binary);
+  std::vector<HistType> histories;
+  histories = mdl.ReadHistories(k2.Stream(), binary);
+  unordered_map<int32, BaseFloat> pdf_hist_weight;
+  mdl.ComputeOutputWords(histories, &pdf_hist_weight);
 
   // command for running the test binary: ./test-binary arpa-file history-file
   // arpa-file is the ARPA-format language model
   // history-file has lines of histories, one history per line
   // this test can be slow
+  /*
   KALDI_LOG << "Start weighted histories test...";
   for (int i = 0; i < N / 100; i++) {
     mdl.TestPdfsEqual();
   }
-  KALDI_LOG << "Successfuly pass the test.";
+  KALDI_LOG << "Successfully passed the test.";
+  */
   return 0;
 }
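The test above leaves the (word, probability) map from ComputeOutputWords unused. A minimal, self-contained sketch of one way a caller might consume it, for example to keep only the k most probable output words; TopKWords is a hypothetical helper by the editor, not part of the patch, and plain float stands in for kaldi's BaseFloat:

    #include <algorithm>
    #include <cstdint>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    typedef int32_t int32;

    // Return the k highest-probability (word, prob) pairs from the map
    // filled in by ComputeOutputWords.
    std::vector<std::pair<int32, float> > TopKWords(
        const std::unordered_map<int32, float>& pdf_w, size_t k) {
      std::vector<std::pair<int32, float> > words(pdf_w.begin(), pdf_w.end());
      if (k > words.size()) k = words.size();
      // sort only the first k entries, in descending probability order
      std::partial_sort(words.begin(), words.begin() + k, words.end(),
                        [](const std::pair<int32, float>& a,
                           const std::pair<int32, float>& b) {
                          return a.second > b.second;
                        });
      words.resize(k);
      return words;
    }

For sampling-based RNNLM training, a caller would more likely renormalize these masses together with the unigram distribution, but that step is outside what this patch implements.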