From 62c48b20af620ac8591ac1c7655e618dd453dbef Mon Sep 17 00:00:00 2001 From: Ke Li Date: Mon, 23 Jan 2017 15:58:57 -0500 Subject: [PATCH 1/5] sample a word version1 (IO is written by myself) --- src/rnnlm/sample_a_word.cc | 432 +++++++++++++++++++++++++++++++++++++ src/rnnlm/sample_a_word.h | 159 ++++++++++++++ 2 files changed, 591 insertions(+) create mode 100644 src/rnnlm/sample_a_word.cc create mode 100644 src/rnnlm/sample_a_word.h diff --git a/src/rnnlm/sample_a_word.cc b/src/rnnlm/sample_a_word.cc new file mode 100644 index 00000000000..8761291d525 --- /dev/null +++ b/src/rnnlm/sample_a_word.cc @@ -0,0 +1,432 @@ +// sample_a_word.cc + +#include "sample_a_word.h" +#include +#include +#include +#include +#include + +// Constructor for sampling the next word +NgramModel::NgramModel(char* arpa_file, char* histories_file) { + vocab_size_ = 0; + ReadARPAModel(arpa_file); + ReadHistories(histories_file); +} + +// Read language model from a ARPA-format file. +void NgramModel::ReadARPAModel(char* file) { + std::ifstream data_input(file); + if (!data_input.is_open()) { + std::cerr << "error opening '" << file + << "' for reading\n"; + exit(1); + } + std::string line; + int32 order; + int32 order_current = 0; + int32 word; + int32 iter = 0; + int32 while_iter = 0; + std::pair probs_pair; + float log_prob; + float backoff_weight; + bool unigram_check = false; + std::cout << "Start reading ARPA-format file..." << std::endl; + while (getline(data_input, line)) { + std::istringstream is(line); + // get the strings splitted by single space + // brace-initialization with C++11 + std::istream_iterator begin(is), end; + std::vector tokens(begin, end); + if (tokens.size() == 0) continue; + if (tokens.size() == 2 && tokens[0] == "ngram") { + std::string substring = tokens[1].substr(2); + int32 count = std::stoi(substring); // get "123456" from "1=123456" + counts_.push_back(count); + order = std::stoi(tokens[1].substr(0)); + continue; + } + if (tokens.size() == 1 && tokens[0] == "\\1-grams:") { + ngram_order_ = order; // ngram_order + probs_.resize(ngram_order_); + std::cout << "Ngram order is: " << ngram_order_ << std::endl; + } + // read current order + if (tokens.size() == 1 && tokens[0] != "\\data\\" && + tokens[0] != "\\end\\") { + order_current = std::stoi(tokens[0].substr(1,1)); + continue; // get the order info and skip processing this line + } + // read vocab and initialize probs of unigrams + if (order_current == 1) { + std::string word_s; + if (tokens.back() != "") { + word_s = tokens.end()[-2]; + backoff_weight = std::stof(tokens.back()); + } else { + word_s = tokens.back(); + backoff_weight = 0; + } + word = iter; + vocab_.insert({word_s, word}); + iter++; + vocab_size_++; + if (iter == counts_[0]) { + bool unigram_check = true; + std::cout << "vocab size: " << vocab_size_ << std::endl; + } + HistType history; + history.resize(0); + log_prob = std::stof(tokens[0]); + probs_pair = std::make_pair(log_prob, backoff_weight); + probs_[order_current - 1][history].insert({word, probs_pair}); + continue; + } + // read each ngram and its log-probs and back-off weights + // read probs of order 1 to N - 1 + if (order_current < ngram_order_ && order_current > 1) { + // case1: backoff_weights exist + if ((tokens.size() > order_current + 1) && (tokens.back() != "") && tokens[0] != "ngram") { + // get the integer for word, the last second string in tokens + std::string second_last = tokens.end()[-2]; + unordered_map::iterator it = vocab_.find(second_last); + if (it != vocab_.end()) { + word = it->second; + 
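+      // (Parsing note: each line is tokenized with a
+      // std::istream_iterator<std::string> over a std::istringstream, and the
+      // \data\ section lines of the form "ngram 2=4520910" are picked apart by
+      // substring. A hypothetical standalone helper, not part of this patch,
+      // could isolate that count-line parsing:
+      //
+      //   // Returns the count from a line such as "ngram 2=4520910",
+      //   // or -1 if the line is not a count line.
+      //   inline int ParseCountLine(const std::string &line) {
+      //     std::istringstream is(line);
+      //     std::istream_iterator<std::string> begin(is), end;
+      //     std::vector<std::string> tokens(begin, end);  // whitespace-split
+      //     if (tokens.size() != 2 || tokens[0] != "ngram") return -1;
+      //     std::size_t eq = tokens[1].find('=');
+      //     if (eq == std::string::npos) return -1;
+      //     return std::stoi(tokens[1].substr(eq + 1));  // count after '='
+      //   }
+      //
+      // Locating '=' with find() rather than a fixed substr(2) offset also
+      // stays correct for n-gram orders of 10 and above.)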
} else { + std::cout << "OOV word found: " << tokens.end()[-2] << std::endl; + } + int32 len_hist = tokens.size() - 3; // exclude the word, log-prob, and bow + HistType history; + for (int32 i = 1; i < len_hist + 1; i++) { + unordered_map::iterator it = vocab_.find(tokens[i]); + if (it != vocab_.end()) { + history.push_back(it->second); + } else { + std::cout << "OOV found in history: " << tokens[i] << std::endl; + } + } + assert (history.size() == order_current - 1); + log_prob = std::stof(tokens[0]); + backoff_weight = std::stof(tokens.back()); + probs_pair = std::make_pair(log_prob, backoff_weight); + probs_[order_current - 1][history].insert({word, probs_pair}); + continue; + } + // case2: no backoff_weights + if (tokens.size() == order_current + 1 && (tokens.back() == "") && tokens[0] != "ngram") { + unordered_map::iterator it = vocab_.find(tokens.back()); + if (it != vocab_.end()) { + word = it->second; + } + int32 len_hist = tokens.size() - 2; // exclude the word and log-prob + HistType history; + assert (len_hist > 0); + for (int32 i = 1; i < len_hist + 1; i++) { + unordered_map::iterator it = vocab_.find(tokens[i]); + if (it != vocab_.end()) { + history.push_back(it->second); + } else { + std::cout << "OOV found in history: " << tokens[i] << std::endl; + } + } + assert (history.size() == order_current - 1); + log_prob = std::stof(tokens[0]); + backoff_weight = 0; // backoff_weight in log space should be 1 (no backoff) + probs_pair = std::make_pair(log_prob, backoff_weight); + probs_[order_current - 1][history].insert({word, probs_pair}); + continue; + } + } else if (order_current == ngram_order_) { // read probs of order N + if (tokens.size() > 2) { + std::string word_s = tokens.back(); + unordered_map::iterator it = vocab_.find(word_s); + if (it != vocab_.end()) { + word = it->second; + } + int32 len_hist = tokens.size() - 2; // exclude the word and log-prob + HistType history; + assert (len_hist > 0); + for (int32 i = 1; i < len_hist + 1; i++) { + unordered_map::iterator it = vocab_.find(tokens[i]); + if (it != vocab_.end()) { + history.push_back(it->second); + } else { + std::cout << "OOV found in history: " << tokens[i] << std::endl; + } + } + log_prob = std::stof(tokens[0]); + backoff_weight = 0; // backoff_weight in log space should be 1 (no backoff) + probs_pair = std::make_pair(log_prob, backoff_weight); + probs_[order_current - 1][history].insert({word, probs_pair}); + continue; + } + } + } + std::cout << "Finish reading ARPA-format file." 
<< std::endl; +} + +float NgramModel::GetProb(int32 order, const int32 word, const HistType& history) { + float prob = 0.0; + auto it = probs_[order - 1].find(history); + if (it != probs_[order - 1].end() && + probs_[order-1][history].find(word) != probs_[order-1][history].end()) { + prob += probs_[order-1][history][word].first; + } else { // backoff to the previous order + order--; + if (order >= 1) { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + prob += GetProb(order, word, h); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob += GetBackoffWeight(order, word_new, h_new); + } + } + return prob; +} + +float NgramModel::GetBackoffWeight(int32 order, const int32 word, const HistType& history) { + float bow = 0.0; + auto it = probs_[order - 1].find(history); + if (it != probs_[order - 1].end()) { + auto it2 = probs_[order - 1][history].find(word); + if (it2 != probs_[order - 1][history].end()) { + bow = (it2->second).second; + } + } + return bow; +} + +void NgramModel::ComputeWordPdf(const HistType& history, std::vector* pdf) { + int32 order = history.size(); + float prob = 0.0; + for (int32 i = 0; i < vocab_size_; i++) { + auto it = probs_[order].find(history); + int32 word = i; + if (it != probs_[order].end()) { + auto it2 = probs_[order][history].find(word); + if (it2 != probs_[order][history].end()) { + prob = pow(10, (it2->second).first); + (*pdf).push_back(prob); + } else { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * + pow(10, GetProb(order, word, h)); + (*pdf).push_back(prob); + } + } else { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * + pow(10, GetProb(order, word, h)); + (*pdf).push_back(prob); + } + } +} + +// Get history weights +void NgramModel::ComputeHistoriesWeights() { + for (auto it = histories_.begin(); it != histories_.end(); ++it) { + HistType history(*(it)); + assert(history.size() <= ngram_order_); + for (int32 i = 0; i < history.size() + 1; i++) { + HistType h_tmp = history; + float prob = 1.0 / histories_.size(); + while (h_tmp.size() > (history.size() - i)) { + HistType::iterator last = h_tmp.end() - 1; + HistType h(h_tmp.begin(), last); + int32 word = h_tmp.back(); + prob *= pow(10, GetBackoffWeight(h_tmp.size(), word, h)); + h_tmp = h; + } + HistType::iterator begin = history.begin() + i; + HistType h(begin, history.end()); + hists_weights_[h] += prob; + } + } + std::cout << "Size of hists_weights_ is: " << hists_weights_.size() << std::endl; +} + +// Get weighted pdf +void NgramModel::ComputeWeightedPdf(std::vector* pdf_w) { + float prob = 0; + (*pdf_w).resize(vocab_size_); // if do not do this, (*pdf_w)[word] += prob will get seg fault + for (int32 i = 0; i < vocab_size_; i++) { + for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) { + HistType h(it->first); + int32 order = h.size(); + auto it_hist = 
probs_[order].find(h); + if (it_hist != probs_[order].end()) { + int32 word = i; + auto it_word = probs_[order][h].find(word); + if (it_word != probs_[order][h].end()) { + if (order > 0) { + HistType::iterator last = h.end() - 1; + HistType::iterator first = h.begin() + 1; + HistType h1(h.begin(), last); + HistType h2(first, h.end()); + prob = (it->second) * (pow(10, probs_[order][h][word].first) - + pow(10, GetBackoffWeight(order, h.back(), h1)) + * pow(10, GetProb(order, word, h2))); + (*pdf_w)[word] += prob; + } + else { + prob = (it->second) * pow(10, probs_[order][h][word].first); + (*pdf_w)[word] += prob; + } + } + } + } // end reading history + } // end reading words +} + +// sample a word that follows a pdf +int32 NgramModel::SampleWord(const std::vector& pdf) { + // generate a cdf from the given pdf + std::vector > cdf; + float upper = 0; + float lower = 0; + std::pair probs; + for (int32 i = 0; i < pdf.size(); i++) { + upper += pdf[i]; + lower = upper - pdf[i]; + probs = std::make_pair(lower, upper); + cdf.push_back(probs); + } + float u = 1.0 * rand()/RAND_MAX; + for (int32 i = 0; i < cdf.size(); i++) { + if (cdf[i].first <= u < cdf[i].second) { + return i; + } + } +} + +// Sampling a word +void NgramModel::TestSampling(int32 iters) { + ComputeHistoriesWeights(); + std::vector pdf; + ComputeWeightedPdf(&pdf); + + // Compute diff + std::vector pdf_est; + pdf_est.resize(vocab_size_); + int32 word; + int32 count_nons = 0; + for (int32 i = 0; i < iters; i++) { + word = SampleWord(pdf); + if (word > vocab_size_ || word < 0) { + std::cout << "the next word is " << word << std::endl; + count_nons += 1; + continue; + } else { + pdf_est[word] += 1.0; + } + } + // normalization + float ed = 0; + for (int32 i = 0; i < vocab_size_; i++) { + pdf_est[word] /= iters; + ed += pow(pdf_est[word] - pdf[word], 2); + } + ed = pow(ed, 0.5); + std::cout << "Run " << iters << " times, e distance (expect < 0.05) is " << ed << std::endl; + std::cout << "Number of words OOV : " << count_nons << std::endl; +} + +// Test the read-in language model +void NgramModel::TestReadingModel() { + std::cout << "Testing model reading part..."<< std::endl; + std::cout << "Vocab size is: " << vocab_size_ << std::endl; + std::cout << "Ngram_order is: " << ngram_order_ << std::endl; + assert(probs_.size() == counts_.size()); + for (int32 i = 0; i < ngram_order_; i++) { + int32 size_ngrams = 0; + std::cout << "Test: for order " << (i + 1) << std::endl; + std::cout << "Expected number of " << (i + 1) << "-grams: " << counts_[i] << std::endl; + for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) { + HistType h(it1->first); + for (auto it2 = (probs_[i])[h].begin(); it2 != (probs_[i])[h].end(); ++it2) { + size_ngrams++; // number of words given + } + } + std::cout << "Read in number of " << (i + 1) << "-grams: " << size_ngrams << std::endl; + } + std::cout << "Assert sum of unigram probs equal to 1..." << std::endl; + float prob_sum = 0.0; + int32 count = 0; + for (auto it1 = (probs_[0]).begin(); it1 != (probs_[0]).end();++it1) { + HistType h(it1->first); + for (auto it2 = (probs_[0])[h].begin(); it2 != (probs_[0])[h].end(); ++it2) { + prob_sum += 1.0 * pow(10.0, (it2->second).first); + count++; + } + } + std::cout << "Number of total words: " << count << std::endl; + std::cout << "Sum of unigram probs equal to " << prob_sum << std::endl; + + std::cout << "Assert sum of bigram probs given a history equal to 1..." 
<< std::endl; + prob_sum = 0.0; + auto it1 = probs_[1].begin(); + HistType h(it1->first); + for (auto it = vocab_.begin(); it != vocab_.end(); ++it) { + auto it2 = probs_[1][h].find(it->second); + if (it2 != probs_[1][h].end()) { + prob_sum += 1.0 * pow(10, (it2->second).first); + } else { + prob_sum += pow(10, GetProb(2, it->second, h)); + } + } + std::cout << "Sum of bigram probs given a history equal to " << prob_sum << std::endl; + +} + +// Read histories of integers from a file +void NgramModel::ReadHistories(char* file) { + std::ifstream data_input(file); + if (!data_input.is_open()) { + std::cerr << "error opening '" << file + << "' for reading\n"; + exit(1); + } + std::string line; + std::cout << "Start reading histories..." << std::endl; + while (getline(data_input, line)) { + std::istringstream is(line); + std::istream_iterator begin(is), end; + std::vector tokens(begin, end); + HistType history; + int32 word; + for (int32 i = 0; i < tokens.size(); i++) { + auto it = vocab_.find(tokens[i]); + if (it != vocab_.end()) { + word = it->second; + } else { + std::string word_s = ""; + auto it_unk = vocab_.find(word_s); + assert (it_unk != vocab_.end()); + word = it_unk->second; + } + history.push_back(word); + } + if (history.size() >= ngram_order_) { + // TODO: try slicing it later + std::reverse(history.begin(), history.end()); + history.resize(ngram_order_ - 1); + std::reverse(history.begin(), history.end()); + } + histories_.push_back(history); + } + std::cout << "Finished reading histories." << std::endl; +} diff --git a/src/rnnlm/sample_a_word.h b/src/rnnlm/sample_a_word.h new file mode 100644 index 00000000000..86fa4e1d4ee --- /dev/null +++ b/src/rnnlm/sample_a_word.h @@ -0,0 +1,159 @@ +// sample_a_word.h + +// Copyright 2016 Ke Li + +// See ../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABILITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#ifndef SAMPLE_A_WORD_H_ +#define SAMPLE_A_WORD_H_ + +#include +#include + +#ifdef _MSC_VER +#include +#include +using std::unordered_map; +using std::unordered_set; +#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) +#include +#include +using std::unordered_map; +using std::unordered_set; +#else +#include +#include +using std::tr1::unordered_map; +using std::tr1::unordered_set; +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef int32_t int32; + +/// A hashing function-object for vectors of ints. +struct IntVectorHasher { // hashing function for vector. 
+  size_t operator()(const std::vector<int32> &x) const {
+    size_t ans = 0;
+    typename std::vector<int32>::const_iterator iter = x.begin(), end = x.end();
+    for (; iter != end; ++iter) {
+      ans *= kPrime;
+      ans += *iter;
+    }
+    return ans;
+  }
+ private:
+  static const int kPrime = 7853;
+};
+
+typedef std::vector<int32> HistType;
+typedef unordered_map<int32, std::pair<float, float> > WordToProbsMap;
+typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType;
+typedef unordered_map<HistType, float, IntVectorHasher> HistWeightsType;
+
+class Timer {
+ public:
+  Timer() { Reset(); }
+
+  void Reset() { gettimeofday(&this->time_start_, &time_zone_); }
+
+  /// Returns time in seconds.
+  double Elapsed() {
+    struct timeval time_end;
+    gettimeofday(&time_end, &time_zone_);
+    double t1, t2;
+    t1 = static_cast<double>(time_start_.tv_sec) +
+        static_cast<double>(time_start_.tv_usec)/(1000*1000);
+    t2 = static_cast<double>(time_end.tv_sec) +
+        static_cast<double>(time_end.tv_usec)/(1000*1000);
+    return t2-t1;
+  }
+
+ private:
+  struct timeval time_start_;
+  struct timezone time_zone_;
+};
+
+class NgramModel {
+ public:
+  // Constructor for testing
+  NgramModel(char* arpa_file, char* histories_file);
+
+  void TestReadingModel();
+
+  void TestSampling(int32 iters);
+
+ private:
+  // This function returns the log probability of an n-gram term from the ARPA
+  // LM if it is found; it backs off to the lower-order model when the n-gram
+  // term does not exist.
+  float GetProb(int32 order, const int32 word, const HistType& history);
+
+  // Get the back-off weight of an n-gram in the read-in model
+  float GetBackoffWeight(int32 order, const int32 word, const HistType& history);
+
+  // Compute a pdf over words in the vocab given a history
+  void ComputeWordPdf(const HistType& history, std::vector<float>* pdf);
+
+  // Compute weights of given histories
+  void ComputeHistoriesWeights();
+  // Compute weighted pdf given all histories
+  void ComputeWeightedPdf(std::vector<float>* weighted_pdf);
+
+  // Sample the next word
+  int32 SampleWord(const std::vector<float>& pdf);
+
+  // Read the language model probs_ from stream.
+  // Called from constructor; checks the sum of unigrams.
+  void ReadARPAModel(char* arpa_file);
+
+  void ReadHistories(char* file);
+
+  // N-gram order of the read-in LM.
+  int32 ngram_order_;
+
+  // Counts of each n-gram order
+  std::vector<int32> counts_;
+
+  // Vocab size
+  int32 vocab_size_;
+
+  // Vocab
+  unordered_map<std::string, int32> vocab_;
+
+  // N-gram probabilities.
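+  // (Layout note: the order-k table lives at probs_[k - 1]; it is keyed
+  // first by the (k - 1)-word history and then by the predicted word, and
+  // each entry stores the pair (log10 probability, log10 backoff weight),
+  // so a bigram entry is read as
+  //   std::pair<float, float> p = probs_[1][history][word];
+  // per the typedefs above.)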
+ std::vector probs_; + + // Histories' weights + HistWeightsType hists_weights_; + + // The given N Histories + std::vector histories_; +}; + +#endif From 7ae1bff644f12e3c1bd8f8302c3a93889fc0c2e2 Mon Sep 17 00:00:00 2001 From: Ke Li Date: Tue, 7 Feb 2017 22:36:17 -0500 Subject: [PATCH 2/5] sample a word version 2 (use fst Symbol table and kaldi io) --- src/rnnlm/Makefile | 7 +- src/rnnlm/arpa-sampling.cc | 402 ++++++++++++++++++++++++++++++++++ src/rnnlm/arpa-sampling.h | 182 +++++++++++++++ src/rnnlm/rnnlm-utils-test.cc | 38 +++- 4 files changed, 622 insertions(+), 7 deletions(-) create mode 100644 src/rnnlm/arpa-sampling.cc create mode 100644 src/rnnlm/arpa-sampling.h diff --git a/src/rnnlm/Makefile b/src/rnnlm/Makefile index bd94149bdfa..48b49d61efb 100644 --- a/src/rnnlm/Makefile +++ b/src/rnnlm/Makefile @@ -10,15 +10,14 @@ TESTFILES = rnnlm-utils-test OBJFILES = rnnlm-component-itf.o rnnlm-utils.o rnnlm-nnet.o rnnlm-component.o nnet-parse.o \ rnnlm-training.o \ - rnnlm-diagnostics.o -# rnnlm-utils-test.o -# rnnlm-test-utils.o + rnnlm-diagnostics.o \ + arpa-sampling.o \ LIBNAME = kaldi-rnnlm ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ - ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ + ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../lm/kaldi-lm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ ../tree/kaldi-tree.a ../util/kaldi-util.a ../thread/kaldi-thread.a \ ../matrix/kaldi-matrix.a ../base/kaldi-base.a diff --git a/src/rnnlm/arpa-sampling.cc b/src/rnnlm/arpa-sampling.cc new file mode 100644 index 00000000000..f512d11fbcb --- /dev/null +++ b/src/rnnlm/arpa-sampling.cc @@ -0,0 +1,402 @@ +// arpa-sampling.cc + +#include "arpa-sampling.h" +#include +#include +#include +#include +#include + +namespace kaldi { + +void ArpaSampling::ConsumeNGram(const NGram& ngram) { + int32 cur_order = ngram.words.size(); + int32 word = ngram.words.back(); // word is the last word in vector words + HistType history(ngram.words.begin(), ngram.words.begin() + cur_order - 1); + KALDI_ASSERT(history.size() == cur_order - 1); + + BaseFloat log_prob = ngram.logprob / M_LN10; + BaseFloat backoff_weight = ngram.backoff / M_LN10; + std::pair probs_pair; + probs_pair = std::make_pair(log_prob, backoff_weight); + // update map + probs_[cur_order - 1][history].insert({word, probs_pair}); + + // get vocab_, the map from word string to integer + const fst::SymbolTable* sym = Symbols(); + if (cur_order == 1) { + num_words_++; + std::string word_s = sym->Find(word); + std::pair word_pair; + word_pair = std::make_pair(word_s, word); + vocab_.push_back(word_pair); + } +} + +void ArpaSampling::HeaderAvailable() { + ngram_counts_ = NgramCounts(); + ngram_order_ = NgramCounts().size(); + probs_.resize(ngram_order_); +} + +BaseFloat ArpaSampling::GetProb(int32 order, int32 word, const HistType& history) { + BaseFloat prob = 0.0; + auto it = probs_[order - 1].find(history); + if (it != probs_[order - 1].end() && + probs_[order-1][history].find(word) != probs_[order-1][history].end()) { + prob += probs_[order-1][history][word].first; + } else { // backoff to the previous order + order--; + if (order >= 1) { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + prob += GetProb(order, word, h); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob 
+= GetBackoffWeight(order, word_new, h_new); + } + } + return prob; +} + +BaseFloat ArpaSampling::GetBackoffWeight(int32 order, int32 word, const HistType& history) { + BaseFloat bow = 0.0; + auto it = probs_[order - 1].find(history); + if (it != probs_[order - 1].end()) { + auto it2 = probs_[order - 1][history].find(word); + if (it2 != probs_[order - 1][history].end()) { + bow = (it2->second).second; + } + } + return bow; +} + +void ArpaSampling::ComputeWordPdf(const HistType& history, std::vector >* pdf) { + int32 order = history.size(); + BaseFloat prob = 0.0; + for (int32 i = 0; i < num_words_; i++) { + auto it = probs_[order].find(history); + int32 word = vocab_[i].second; // get word from the map + if (it != probs_[order].end()) { + auto it2 = probs_[order][history].find(word); + if (it2 != probs_[order][history].end()) { + prob = pow(10, (it2->second).first); + (*pdf)[i].first = word; + (*pdf)[i].second += prob; + } else { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * + + pow(10, GetProb(order, word, h)); + (*pdf)[i].first = word; + (*pdf)[i].second += prob; + } + } else { + HistType::const_iterator first = history.begin() + 1; + HistType::const_iterator last = history.end(); + HistType h(first, last); + int32 word_new = history.back(); + HistType::const_iterator last_new = history.end() - 1; + HistType h_new(history.begin(), last_new); + prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * + pow(10, GetProb(order, word, h)); + (*pdf)[i].first = word; + (*pdf)[i].second += prob; + } + } +} + +// Get history weights +void ArpaSampling::ComputeHistoriesWeights() { + for (auto it = histories_.begin(); it != histories_.end(); ++it) { + HistType history(*(it)); + KALDI_ASSERT(history.size() <= ngram_order_); + for (int32 i = 0; i < history.size() + 1; i++) { + HistType h_tmp = history; + BaseFloat prob = 1.0 / histories_.size(); + while (h_tmp.size() > (history.size() - i)) { + HistType::iterator last = h_tmp.end() - 1; + HistType h(h_tmp.begin(), last); + int32 word = h_tmp.back(); + prob *= pow(10, GetBackoffWeight(h_tmp.size(), word, h)); + h_tmp = h; + } + HistType::iterator begin = history.begin() + i; + HistType h(begin, history.end()); + hists_weights_[h] += prob; + } + } + KALDI_LOG << "Size of hists_weights_ is: " << hists_weights_.size(); +} + +// Get weighted pdf +void ArpaSampling::ComputeWeightedPdf(std::vector >* pdf_w) { + BaseFloat prob = 0; + (*pdf_w).resize(num_words_); // if do not do this, (*pdf_w)[word] += prob will get seg fault + for (int32 i = 0; i < num_words_; i++) { + for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) { + HistType h(it->first); + int32 order = h.size(); + auto it_hist = probs_[order].find(h); + if (it_hist != probs_[order].end()) { + int32 word = vocab_[i].second; + auto it_word = probs_[order][h].find(word); + if (it_word != probs_[order][h].end()) { + if (order > 0) { + HistType::iterator last = h.end() - 1; + HistType::iterator first = h.begin() + 1; + HistType h1(h.begin(), last); + HistType h2(first, h.end()); + prob = (it->second) * (pow(10, probs_[order][h][word].first) - + pow(10, GetBackoffWeight(order, h.back(), h1)) + * pow(10, GetProb(order, word, h2))); + (*pdf_w)[i].first = word; + (*pdf_w)[i].second += prob; + } else { + 
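+          // (In scalar form, the branch above accumulates, for each history h
+          // with weight alpha(h) and each word w whose full n-gram (h, w)
+          // exists,
+          //   alpha(h) * (p(w | h) - bow(h) * p(w | h')),
+          // where h' is h with its oldest word dropped and p and bow denote
+          // the base-10 exponentiated probability and backoff weight; the
+          // subtraction removes mass that the lower-order history bucket h'
+          // adds again for w through its own weight. This branch handles the
+          // empty history, where there is nothing to subtract.)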
prob = (it->second) * pow(10, probs_[order][h][word].first); + (*pdf_w)[i].first = word; + (*pdf_w)[i].second += prob; + } + } + } + } // end reading history + } // end reading words +} + +// sample a word that follows a pdf +int32 ArpaSampling::SampleWord(const std::vector >& pdf) { + // generate a cdf from the given pdf + std::vector > cdf; + BaseFloat upper = 0; + int32 word; + std::pair probs; + for (int32 i = 0; i < num_words_; i++) { + upper += pdf[i].second; + word = vocab_[i].second; + probs = std::make_pair(word, upper); + cdf.push_back(probs); + } + BaseFloat u = 1.0 * RandUniform(); + if (u >= 0 && u < cdf[1].second) { + return cdf[0].first; + } + for (int32 i = 1; i < num_words_; i++) { + if (cdf[i - 1].second <= u && u < cdf[i].second) { + return cdf[i].first; + } + } + return -1; +} + +// Sample a word +void ArpaSampling::TestSampling() { + ComputeHistoriesWeights(); + std::vector > pdf; + ComputeWeightedPdf(&pdf); + BaseFloat sum = 0; + for (int32 i = 0; i < num_words_; i++) { + sum += pdf[i].second; + } + + // Check convergence + unordered_map pdf_est; + int32 word; + int32 count_nons = 0; + int32 count = 0; + for (int32 i = 0; ; i++) { + word = SampleWord(pdf); + if (word > num_words_ || word < 0) { + KALDI_LOG << "the next word is " << word; + count_nons += 1; + continue; + } else { + auto it = pdf_est.find(word); + if (it == pdf_est.end()) { + pdf_est.insert({word, 1.0}); + } else { + pdf_est[word] += 1.0; + } + } + count++; + if (count % 1000 == 0) { + // normalization + BaseFloat ed = 0; + for (int32 i = 0; i < num_words_; i++) { + int32 word = vocab_[i].second; + pdf_est[word] /= count; + ed += pow(pdf_est[word] - pdf[i].second, 2); + } + ed = pow(ed, 0.5); + // KALDI_LOG << "Run " << count << " times, Euclidean distance is " << ed; + if (ed <= 0.05) { + KALDI_LOG << "Run " << count << " times, Euclidean distance (expect <= 0.05) is " << ed; + break; + } + } + } + KALDI_LOG << "Number of words OOV : " << count_nons; +} + +// this function returns the log probability of the given sentence +BaseFloat ArpaSampling::ComputeSentenceProb(const std::vector& sentence) { + BaseFloat prob = 0; + const fst::SymbolTable* sym = Symbols(); + for (int32 i = 1; i < sentence.size(); i++) { + if (i < ngram_order_ - 1) { + HistType::const_iterator last = sentence.begin() + i; + HistType h(sentence.begin(), last); + prob += GetProb(i + 1, sentence[i], h); + } else { + HistType::const_iterator first = sentence.begin() + i + 1 - ngram_order_; + HistType::const_iterator last = sentence.begin() + i; + HistType h(first, last); + KALDI_ASSERT(h.size() == ngram_order_ - 1); + prob += GetProb(ngram_order_, sentence[i], h); + } + std::string word_s = sym->Find(sentence[i]); + if (sentence[i] == kUnk) { + word_s = unk_symbol_; + } + } + return prob; +} + +// this functions computes the total log probability of all test sentences +BaseFloat ArpaSampling::ComputeAllSentencesProb(const std::vector >& sentences) { + BaseFloat prob = 0; + for (int32 i = 0; i < sentences.size(); i++) { + KALDI_ASSERT(sentences[i].size() >= 3); + prob += ComputeSentenceProb(sentences[i]); + } + int32 len = sentences.size(); + KALDI_LOG << "Total log-probabilities of " << len << " sentences are: "\ + << prob; + return prob; +} + +void ArpaSampling::PrintHist(const HistType& h) { + KALDI_LOG << "Current hist is: "; + for (int32 i = 0; i < h.size(); i++) { + KALDI_LOG << h[i] << " "; + } +} + +// Test the read-in model by computing the total prob of given sentences +void ArpaSampling::TestProbs(std::istream &is, bool 
binary) { + std::vector > sentences; + ReadSentences(is, &sentences); + ComputeAllSentencesProb(sentences); +} + +// Test the read-in language model +void ArpaSampling::TestReadingModel() { + KALDI_LOG << "Testing model reading part..."<< std::endl; + KALDI_LOG << "Vocab size is: " << vocab_.size(); + KALDI_LOG << "Ngram_order is: " << ngram_order_; + KALDI_ASSERT(probs_.size() == ngram_counts_.size()); + for (int32 i = 0; i < ngram_order_; i++) { + int32 size_ngrams = 0; + KALDI_LOG << "Test: for order " << (i + 1); + KALDI_LOG << "Expected number of " << (i + 1) << "-grams: " << ngram_counts_[i]; + for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) { + HistType h(it1->first); + for (auto it2 = (probs_[i])[h].begin(); it2 != (probs_[i])[h].end(); ++it2) { + size_ngrams++; // number of words given + } + } + KALDI_LOG << "Read in number of " << (i + 1) << "-grams: " << size_ngrams; + } + KALDI_LOG << "Assert sum of unigram probs equal to 1..."; + BaseFloat prob_sum = 0.0; + int32 count = 0; + for (auto it1 = (probs_[0]).begin(); it1 != (probs_[0]).end();++it1) { + HistType h(it1->first); + for (auto it2 = (probs_[0])[h].begin(); it2 != (probs_[0])[h].end(); ++it2) { + prob_sum += 1.0 * pow(10.0, (it2->second).first); + count++; + } + } + KALDI_LOG << "Number of total words: " << count; + KALDI_LOG << "Sum of unigram probs equal to " << prob_sum; + + KALDI_LOG << "Assert sum of bigram probs given a history equal to 1..."; + prob_sum = 0.0; + auto it1 = probs_[1].begin(); + HistType h(it1->first); + for (int32 i = 0; i < num_words_; i++) { + auto it2 = probs_[1][h].find(vocab_[i].second); + if (it2 != probs_[1][h].end()) { + prob_sum += 1.0 * pow(10, (it2->second).first); + } else { + prob_sum += pow(10, GetProb(2, vocab_[i].second, h)); + } + } + KALDI_LOG << "Sum of bigram probs given a history equal to " << prob_sum; +} + +// Read sentences from a file +void ArpaSampling::ReadSentences(std::istream &iss, std::vector >* sentences) { + const fst::SymbolTable* sym = Symbols(); + std::string line; + KALDI_LOG << "Start reading sentences..."; + while (getline(iss, line)) { + std::istringstream is(line); + std::istream_iterator begin(is), end; + std::vector tokens(begin, end); + std::vector sentence; + int32 word; + int32 bos = sym->Find(bos_symbol_); + sentence.push_back(bos); + for (int32 i = 0; i < tokens.size(); i++) { + word = sym->Find(tokens[i]); + if (word == fst::SymbolTable::kNoSymbol) { + word = sym->Find(unk_symbol_); + } + sentence.push_back(word); + } + int32 eos = sym->Find(eos_symbol_); + sentence.push_back(eos); + (*sentences).push_back(sentence); + } + KALDI_LOG << "Finished reading sentences."; +} + +// Read histories of integers from a file +void ArpaSampling::ReadHistories(std::istream &is, bool binary) { + if (binary) { + KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser"; + } + const fst::SymbolTable* sym = Symbols(); + std::string line; + KALDI_LOG << "Start reading histories..."; + while (getline(is, line)) { + std::istringstream is(line); + std::istream_iterator begin(is), end; + std::vector tokens(begin, end); + HistType history; + int32 word; + for (int32 i = 0; i < tokens.size(); i++) { + word = sym->Find(tokens[i]); + if (word == fst::SymbolTable::kNoSymbol) { + word = sym->Find(unk_symbol_); + } + history.push_back(word); + } + if (history.size() >= ngram_order_) { + std::reverse(history.begin(), history.end()); + history.resize(ngram_order_ - 1); + std::reverse(history.begin(), history.end()); + } + 
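+    // (The reverse/resize/reverse above just keeps the most recent
+    // ngram_order_ - 1 words; an equivalent single call would be
+    //   history.erase(history.begin(), history.end() - (ngram_order_ - 1));
+    // since only that tail of a long history is usable by the model.)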
histories_.push_back(history);
+  }
+  KALDI_LOG << "Finished reading histories.";
+}
+
+}  // end of kaldi
diff --git a/src/rnnlm/arpa-sampling.h b/src/rnnlm/arpa-sampling.h
new file mode 100644
index 00000000000..bad3b08953f
--- /dev/null
+++ b/src/rnnlm/arpa-sampling.h
@@ -0,0 +1,182 @@
+// arpa-sampling.h
+
+// Copyright 2016  Ke Li
+
+// See ../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABILITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ARPA_SAMPLING_H_
+#define ARPA_SAMPLING_H_
+
+#include
+#include
+#include "lm/arpa-file-parser.h"
+#include "fst/fstlib.h"
+
+#ifdef _MSC_VER
+#include <unordered_map>
+#include <unordered_set>
+using std::unordered_map;
+using std::unordered_set;
+#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__)
+#include <unordered_map>
+#include <unordered_set>
+using std::unordered_map;
+using std::unordered_set;
+#else
+#include <tr1/unordered_map>
+#include <tr1/unordered_set>
+using std::tr1::unordered_map;
+using std::tr1::unordered_set;
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace kaldi {
+
+typedef int32_t int32;
+
+/// A hashing function-object for vectors of ints.
+struct IntVectorHasher {  // hashing function for vector<int>.
+  size_t operator()(const std::vector<int32> &x) const {
+    size_t ans = 0;
+    typename std::vector<int32>::const_iterator iter = x.begin(), end = x.end();
+    for (; iter != end; ++iter) {
+      ans *= kPrime;
+      ans += *iter;
+    }
+    return ans;
+  }
+ private:
+  static const int kPrime = 7853;
+};
+
+// Predefine some symbol values, because any integer is as good as any other.
+enum {
+  kEps = 0,
+  kDisambig,
+  kBos, kEos, kUnk
+};
+
+typedef std::vector<int32> HistType;
+typedef unordered_map<int32, std::pair<BaseFloat, BaseFloat> > WordToProbsMap;
+typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType;
+typedef unordered_map<HistType, BaseFloat, IntVectorHasher> HistWeightsType;
+
+class ArpaSampling : public ArpaFileParser {
+ public:
+  // constructor
+  explicit ArpaSampling(ArpaParseOptions options, fst::SymbolTable* symbols)
+      : ArpaFileParser(options, symbols) {
+    ngram_order_ = 0;
+    num_words_ = 0;
+    bos_symbol_ = "<s>";
+    eos_symbol_ = "</s>";
+    unk_symbol_ = "<unk>";
+  }
+  // Compute the probability of a given sentence with the ngram_order LM
+  BaseFloat ComputeSentenceProb(const std::vector<int32>& test_sentence);
+
+  // Test the read-in model by computing probs of all sentences with the
+  // ngram_order LM
+  BaseFloat ComputeAllSentencesProb(const std::vector<std::vector<int32> >& test_sentences);
+
+  void TestReadingModel();
+
+  void TestProbs(std::istream &is, bool binary);
+
+  void TestSampling();
+
+  // print a history
+  void PrintHist(const HistType& h);
+
+  void ReadHistories(std::istream &is, bool binary);
+
+  void ReadSentences(std::istream &is, std::vector<std::vector<int32> >* sentences);
+
+ protected:
+  // ArpaFileParser overrides.
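+  // (Calling convention: while Read() consumes the ARPA file,
+  // ArpaFileParser invokes HeaderAvailable() once after the \data\ counts,
+  // ConsumeNGram() once per n-gram entry (with logprob and backoff already
+  // converted to natural log, hence the division by M_LN10 in
+  // arpa-sampling.cc to return to base 10), and ReadComplete() at the end.)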
+  virtual void HeaderAvailable();
+  virtual void ConsumeNGram(const NGram& ngram);
+  virtual void ReadComplete() {}
+
+ private:
+  // This function returns the log probability of an n-gram term from the ARPA
+  // LM if it is found; it backs off to the lower-order model when the n-gram
+  // term does not exist.
+  BaseFloat GetProb(int32 order, int32 word, const HistType& history);
+
+  // Get the back-off weight of an n-gram in the read-in model
+  BaseFloat GetBackoffWeight(int32 order, int32 word, const HistType& history);
+
+  // Compute a pdf over words in the vocab given a history
+  void ComputeWordPdf(const HistType& history,
+                      std::vector<std::pair<int32, BaseFloat> >* pdf);
+
+  // Compute weights of given histories
+  void ComputeHistoriesWeights();
+
+  // Compute the weighted pdf given all histories
+  void ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >* weighted_pdf);
+
+  // Sample the next word
+  int32 SampleWord(const std::vector<std::pair<int32, BaseFloat> >& pdf);
+
+  // N-gram order of the read-in LM.
+  int32 ngram_order_;
+
+  // Number of words in the vocab
+  int32 num_words_;
+
+  // Bos symbol
+  std::string bos_symbol_;
+
+  // Eos symbol
+  std::string eos_symbol_;
+
+  // Unk symbol
+  std::string unk_symbol_;
+
+  // Vocab: pairs of (word string, integer id)
+  std::vector<std::pair<std::string, int32> > vocab_;
+
+  // Counts of each n-gram order
+  std::vector<int32> ngram_counts_;
+
+  // N-gram probabilities.
+  std::vector<NgramType> probs_;
+
+  // Histories' weights
+  HistWeightsType hists_weights_;
+
+  // The given N histories
+  std::vector<HistType> histories_;
+
+  // Test sentences
+  std::vector<std::vector<int32> > sentences_;
+};
+
+}  // end of namespace kaldi
+#endif
diff --git a/src/rnnlm/rnnlm-utils-test.cc b/src/rnnlm/rnnlm-utils-test.cc
index 793b9d3a498..5a1466b68f5 100644
--- a/src/rnnlm/rnnlm-utils-test.cc
+++ b/src/rnnlm/rnnlm-utils-test.cc
@@ -1,7 +1,13 @@
 // rnnlm/rnnlm-utils-test.cc
 
-#include
 #include "rnnlm/rnnlm-utils.h"
+#include "arpa-sampling.h"
+
+#include
+#include
+#include "base/kaldi-common.h"
+#include "util/common-utils.h"
+#include "fst/fstlib.h"
 
 namespace kaldi {
 namespace rnnlm {
@@ -168,12 +174,38 @@ void UnitTestSamplingTime(int iters) {
 
 }  // end namespace rnnlm
 }  // end namespace kaldi.
 
-int main() {
+int main(int argc, char **argv) {
   using namespace kaldi;
   using namespace rnnlm;
   int N = 10000;
   UnitTestSampleWithProbOne(N);
   UnitTestSamplingTime(N);
   UnitTestSamplingConvergence();
-}
+  const char *usage = "";
+  ParseOptions po(usage);
+  po.Read(argc, argv);
+  std::string arpa_file = po.GetArg(1), history_file = po.GetArg(2);
+
+  ArpaParseOptions options;
+  fst::SymbolTable symbols;
+  // Use spaces on special symbols, so we fail rather than read them by mistake.
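+  // (The ids passed to AddSymbol below are pinned by the enum in
+  // arpa-sampling.h: kEps = 0, then kDisambig, kBos, kEos, kUnk. With
+  // oov_handling set to kAddToSymbols, any ARPA word not yet in the table is
+  // appended automatically, so the symbol table doubles as the model's
+  // vocabulary.)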
+ symbols.AddSymbol(" ", kEps); + symbols.AddSymbol(" #0", kDisambig); + options.bos_symbol = symbols.AddSymbol("", kBos); + options.eos_symbol = symbols.AddSymbol("", kEos); + options.unk_symbol = symbols.AddSymbol("", kUnk); + options.oov_handling = ArpaParseOptions::kAddToSymbols; + ArpaSampling mdl(options, &symbols); + + bool binary; + Input k1(arpa_file, &binary); + mdl.Read(k1.Stream(), binary); + mdl.TestReadingModel(); + + Input k2(history_file, &binary); + mdl.ReadHistories(k2.Stream(), binary); + + mdl.TestSampling(); + return 0; +} From 04996e8c0fe0d072742e50f13b08ea5c7724e687 Mon Sep 17 00:00:00 2001 From: Ke Li Date: Tue, 28 Mar 2017 19:19:23 -0400 Subject: [PATCH 3/5] fix a bug in computing weights of histories --- src/rnnlm/arpa-sampling.cc | 42 ++++++++++++++++++++++++++++++++++- src/rnnlm/arpa-sampling.h | 2 ++ src/rnnlm/rnnlm-utils-test.cc | 5 ++++- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/src/rnnlm/arpa-sampling.cc b/src/rnnlm/arpa-sampling.cc index f512d11fbcb..c771aa68edf 100644 --- a/src/rnnlm/arpa-sampling.cc +++ b/src/rnnlm/arpa-sampling.cc @@ -76,6 +76,7 @@ BaseFloat ArpaSampling::GetBackoffWeight(int32 order, int32 word, const HistType void ArpaSampling::ComputeWordPdf(const HistType& history, std::vector >* pdf) { int32 order = history.size(); BaseFloat prob = 0.0; + (*pdf).resize(num_words_); // if do not do this, (*pdf)[word] += prob will get seg fault for (int32 i = 0; i < num_words_; i++) { auto it = probs_[order].find(history); int32 word = vocab_[i].second; // get word from the map @@ -126,7 +127,8 @@ void ArpaSampling::ComputeHistoriesWeights() { HistType h(h_tmp.begin(), last); int32 word = h_tmp.back(); prob *= pow(10, GetBackoffWeight(h_tmp.size(), word, h)); - h_tmp = h; + HistType h_up(h_tmp.begin() + 1, h_tmp.end()); + h_tmp = h_up; } HistType::iterator begin = history.begin() + i; HistType h(begin, history.end()); @@ -244,6 +246,44 @@ void ArpaSampling::TestSampling() { KALDI_LOG << "Number of words OOV : " << count_nons; } +// this function check the estimated pdfs from 1) weighted history and 2) normal computation +// are the same +void ArpaSampling::TestPdfsEqual() { + // get the weighted pdf + ComputeHistoriesWeights(); + std::vector > pdf_hist_weight; + ComputeWeightedPdf(&pdf_hist_weight); + // check the averaged pdf sums to 1 + BaseFloat sum = 0; + for (int32 i = 0; i < num_words_; i++) { + sum += pdf_hist_weight[i].second; + } + KALDI_LOG << "Sum of weighted pfd: " << sum; + // get the average pdf + std::vector > pdf; + pdf.resize(num_words_); + for (int32 i = 0; i < histories_.size(); i++) { + std::vector > pdf_h; + ComputeWordPdf(histories_[i], &pdf_h); + for(int32 j = 0; j < pdf_h.size(); j++) { + pdf[j].first = pdf_h[j].first; + pdf[j].second += pdf_h[j].second / histories_.size(); + } + } + // check the averaged pdf sums to 1 + sum = 0; + for (int32 i = 0; i < num_words_; i++) { + sum += pdf[i].second; + } + KALDI_LOG << "Sum of averaged pdf: " << sum; + // check equality of the two pdfs + BaseFloat diff = 0; + for (int32 i = 0; i < num_words_; i++) { + diff += abs(pdf_hist_weight[i].second - pdf[i].second); + } + KALDI_LOG << " diff of the two pdfs: " << diff; + +} // this function returns the log probability of the given sentence BaseFloat ArpaSampling::ComputeSentenceProb(const std::vector& sentence) { BaseFloat prob = 0; diff --git a/src/rnnlm/arpa-sampling.h b/src/rnnlm/arpa-sampling.h index bad3b08953f..699dbf09d69 100644 --- a/src/rnnlm/arpa-sampling.h +++ b/src/rnnlm/arpa-sampling.h @@ -109,6 +109,8 @@ 
class ArpaSampling : public ArpaFileParser { void TestProbs(std::istream &is, bool binary); void TestSampling(); + + void TestPdfsEqual(); // print history void PrintHist(const HistType& h); diff --git a/src/rnnlm/rnnlm-utils-test.cc b/src/rnnlm/rnnlm-utils-test.cc index 5a1466b68f5..a4f2b7abd79 100644 --- a/src/rnnlm/rnnlm-utils-test.cc +++ b/src/rnnlm/rnnlm-utils-test.cc @@ -205,7 +205,10 @@ int main(int argc, char **argv) { Input k2(history_file, &binary); mdl.ReadHistories(k2.Stream(), binary); - + // command for running the test binary: ./test-binary arpa-file history-file + // arpa-file is the ARPA-format language model + // history-file has lines of histories, one history per line + mdl.TestPdfsEqual(); mdl.TestSampling(); return 0; } From 62e5f9b2c641a692eaa7b36896ed5b8e54643f4d Mon Sep 17 00:00:00 2001 From: Ke Li Date: Thu, 6 Apr 2017 02:10:00 -0400 Subject: [PATCH 4/5] Add history-weight test --- src/rnnlm/arpa-sampling.cc | 111 +++------ src/rnnlm/arpa-sampling.h | 31 +-- src/rnnlm/rnnlm-utils-test.cc | 11 +- src/rnnlm/sample_a_word.cc | 432 ---------------------------------- src/rnnlm/sample_a_word.h | 159 ------------- 5 files changed, 43 insertions(+), 701 deletions(-) delete mode 100644 src/rnnlm/sample_a_word.cc delete mode 100644 src/rnnlm/sample_a_word.h diff --git a/src/rnnlm/arpa-sampling.cc b/src/rnnlm/arpa-sampling.cc index c771aa68edf..d7be9ea7f3b 100644 --- a/src/rnnlm/arpa-sampling.cc +++ b/src/rnnlm/arpa-sampling.cc @@ -135,12 +135,12 @@ void ArpaSampling::ComputeHistoriesWeights() { hists_weights_[h] += prob; } } - KALDI_LOG << "Size of hists_weights_ is: " << hists_weights_.size(); } // Get weighted pdf void ArpaSampling::ComputeWeightedPdf(std::vector >* pdf_w) { BaseFloat prob = 0; + (*pdf_w).clear(); (*pdf_w).resize(num_words_); // if do not do this, (*pdf_w)[word] += prob will get seg fault for (int32 i = 0; i < num_words_; i++) { for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) { @@ -172,84 +172,31 @@ void ArpaSampling::ComputeWeightedPdf(std::vector >* } // end reading words } -// sample a word that follows a pdf -int32 ArpaSampling::SampleWord(const std::vector >& pdf) { - // generate a cdf from the given pdf - std::vector > cdf; - BaseFloat upper = 0; - int32 word; - std::pair probs; - for (int32 i = 0; i < num_words_; i++) { - upper += pdf[i].second; - word = vocab_[i].second; - probs = std::make_pair(word, upper); - cdf.push_back(probs); - } - BaseFloat u = 1.0 * RandUniform(); - if (u >= 0 && u < cdf[1].second) { - return cdf[0].first; - } - for (int32 i = 1; i < num_words_; i++) { - if (cdf[i - 1].second <= u && u < cdf[i].second) { - return cdf[i].first; +void ArpaSampling::RandomGenerateHistories() { + // clear previous histories + histories_.clear(); + // randomly generate histories + int32 num_histories = rand() % 1000 + 5; // generate at least 5 histories + for (int32 i = 0; i < num_histories; i++) { + HistType hist; + // size of history should be in {1, 2, ..., ngram_order_} + int32 size_hist = rand() % (ngram_order_ - 1) + 1; + KALDI_ASSERT(size_hist <= ngram_order_); + for (int32 j = 0; j < size_hist; j++) { + // word can not be zero since zero represents epsilon in the fst symbol format + int32 word = rand() % (vocab_.size() - 1) + 1; + KALDI_ASSERT(word > 0 && word <= vocab_.size()); + hist.push_back(word); } + histories_.push_back(hist); } - return -1; } -// Sample a word -void ArpaSampling::TestSampling() { - ComputeHistoriesWeights(); - std::vector > pdf; - ComputeWeightedPdf(&pdf); - BaseFloat sum = 0; - for 
(int32 i = 0; i < num_words_; i++) { - sum += pdf[i].second; - } - - // Check convergence - unordered_map pdf_est; - int32 word; - int32 count_nons = 0; - int32 count = 0; - for (int32 i = 0; ; i++) { - word = SampleWord(pdf); - if (word > num_words_ || word < 0) { - KALDI_LOG << "the next word is " << word; - count_nons += 1; - continue; - } else { - auto it = pdf_est.find(word); - if (it == pdf_est.end()) { - pdf_est.insert({word, 1.0}); - } else { - pdf_est[word] += 1.0; - } - } - count++; - if (count % 1000 == 0) { - // normalization - BaseFloat ed = 0; - for (int32 i = 0; i < num_words_; i++) { - int32 word = vocab_[i].second; - pdf_est[word] /= count; - ed += pow(pdf_est[word] - pdf[i].second, 2); - } - ed = pow(ed, 0.5); - // KALDI_LOG << "Run " << count << " times, Euclidean distance is " << ed; - if (ed <= 0.05) { - KALDI_LOG << "Run " << count << " times, Euclidean distance (expect <= 0.05) is " << ed; - break; - } - } - } - KALDI_LOG << "Number of words OOV : " << count_nons; -} - -// this function check the estimated pdfs from 1) weighted history and 2) normal computation -// are the same +// this function checks the two estimated pdfs from 1) weighted history +// and 2) normal computation are the same void ArpaSampling::TestPdfsEqual() { - // get the weighted pdf + RandomGenerateHistories(); + hists_weights_.clear(); ComputeHistoriesWeights(); std::vector > pdf_hist_weight; ComputeWeightedPdf(&pdf_hist_weight); @@ -258,7 +205,7 @@ void ArpaSampling::TestPdfsEqual() { for (int32 i = 0; i < num_words_; i++) { sum += pdf_hist_weight[i].second; } - KALDI_LOG << "Sum of weighted pfd: " << sum; + KALDI_ASSERT(ApproxEqual(sum, 1.0)); // get the average pdf std::vector > pdf; pdf.resize(num_words_); @@ -275,15 +222,15 @@ void ArpaSampling::TestPdfsEqual() { for (int32 i = 0; i < num_words_; i++) { sum += pdf[i].second; } - KALDI_LOG << "Sum of averaged pdf: " << sum; + KALDI_ASSERT(ApproxEqual(sum, 1.0)); // check equality of the two pdfs BaseFloat diff = 0; for (int32 i = 0; i < num_words_; i++) { diff += abs(pdf_hist_weight[i].second - pdf[i].second); } - KALDI_LOG << " diff of the two pdfs: " << diff; - + KALDI_ASSERT(ApproxEqual(diff, 0.0)); } + // this function returns the log probability of the given sentence BaseFloat ArpaSampling::ComputeSentenceProb(const std::vector& sentence) { BaseFloat prob = 0; @@ -339,6 +286,10 @@ void ArpaSampling::TestProbs(std::istream &is, bool binary) { void ArpaSampling::TestReadingModel() { KALDI_LOG << "Testing model reading part..."<< std::endl; KALDI_LOG << "Vocab size is: " << vocab_.size(); + std::cout << "Print out vocab: " << std::endl; + for (int i = 0; i < vocab_.size(); i++) { + std::cout << i << " , " << vocab_[i].first << " , " << vocab_[i].second << std::endl; + } KALDI_LOG << "Ngram_order is: " << ngram_order_; KALDI_ASSERT(probs_.size() == ngram_counts_.size()); for (int32 i = 0; i < ngram_order_; i++) { @@ -415,7 +366,7 @@ void ArpaSampling::ReadHistories(std::istream &is, bool binary) { } const fst::SymbolTable* sym = Symbols(); std::string line; - KALDI_LOG << "Start reading histories..."; + KALDI_LOG << "Start reading histories from file..."; while (getline(is, line)) { std::istringstream is(line); std::istream_iterator begin(is), end; @@ -436,7 +387,7 @@ void ArpaSampling::ReadHistories(std::istream &is, bool binary) { } histories_.push_back(history); } - KALDI_LOG << "Finished reading histories."; + KALDI_LOG << "Finished reading histories from file."; } } // end of kaldi diff --git a/src/rnnlm/arpa-sampling.h 
b/src/rnnlm/arpa-sampling.h index 699dbf09d69..1fdedeb573a 100644 --- a/src/rnnlm/arpa-sampling.h +++ b/src/rnnlm/arpa-sampling.h @@ -25,36 +25,15 @@ #include "lm/arpa-file-parser.h" #include "fst/fstlib.h" -#ifdef _MSC_VER -#include -#include -using std::unordered_map; -using std::unordered_set; -#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__) -#include -#include -using std::unordered_map; -using std::unordered_set; -#else -#include -#include -using std::tr1::unordered_map; -using std::tr1::unordered_set; -#endif - -#include #include #include #include #include -#include #include #include #include #include #include -#include -#include namespace kaldi { @@ -78,7 +57,7 @@ struct IntVectorHasher { // hashing function for vector. // Predefine some symbol values, because any integer is as good than any other. enum { kEps = 0, - kDisambig, + // kDisambig, kBos, kEos, kUnk }; @@ -108,8 +87,6 @@ class ArpaSampling : public ArpaFileParser { void TestProbs(std::istream &is, bool binary); - void TestSampling(); - void TestPdfsEqual(); // print history @@ -133,6 +110,9 @@ class ArpaSampling : public ArpaFileParser { // Get the back-off weight of a ngram in the read-in model BaseFloat GetBackoffWeight(int32 order, int32 word, const HistType& history); + + // For test: randomly generate histories + void RandomGenerateHistories(); // Compute a pdf of words in the vocab given a history void ComputeWordPdf(const HistType& history, std::vector >* pdf); @@ -143,9 +123,6 @@ class ArpaSampling : public ArpaFileParser { // Compute weighted pdf given all histories void ComputeWeightedPdf(std::vector >* weighted_pdf); - // Sample the next word - int32 SampleWord(const std::vector >& pdf); - // N-gram order of the read-in LM. int32 ngram_order_; diff --git a/src/rnnlm/rnnlm-utils-test.cc b/src/rnnlm/rnnlm-utils-test.cc index a4f2b7abd79..0f5673b2035 100644 --- a/src/rnnlm/rnnlm-utils-test.cc +++ b/src/rnnlm/rnnlm-utils-test.cc @@ -191,7 +191,7 @@ int main(int argc, char **argv) { fst::SymbolTable symbols; // Use spaces on special symbols, so we rather fail than read them by mistake. symbols.AddSymbol(" ", kEps); - symbols.AddSymbol(" #0", kDisambig); + // symbols.AddSymbol(" #0", kDisambig); options.bos_symbol = symbols.AddSymbol("", kBos); options.eos_symbol = symbols.AddSymbol("", kEos); options.unk_symbol = symbols.AddSymbol("", kUnk); @@ -208,7 +208,12 @@ int main(int argc, char **argv) { // command for running the test binary: ./test-binary arpa-file history-file // arpa-file is the ARPA-format language model // history-file has lines of histories, one history per line - mdl.TestPdfsEqual(); - mdl.TestSampling(); + + // this test can be slow + KALDI_LOG << "Start weighted histories test..."; + for (int i = 0; i < N / 100; i++) { + mdl.TestPdfsEqual(); + } + KALDI_LOG << "Successfuly pass the test."; return 0; } diff --git a/src/rnnlm/sample_a_word.cc b/src/rnnlm/sample_a_word.cc deleted file mode 100644 index 8761291d525..00000000000 --- a/src/rnnlm/sample_a_word.cc +++ /dev/null @@ -1,432 +0,0 @@ -// sample_a_word.cc - -#include "sample_a_word.h" -#include -#include -#include -#include -#include - -// Constructor for sampling the next word -NgramModel::NgramModel(char* arpa_file, char* histories_file) { - vocab_size_ = 0; - ReadARPAModel(arpa_file); - ReadHistories(histories_file); -} - -// Read language model from a ARPA-format file. 
-void NgramModel::ReadARPAModel(char* file) { - std::ifstream data_input(file); - if (!data_input.is_open()) { - std::cerr << "error opening '" << file - << "' for reading\n"; - exit(1); - } - std::string line; - int32 order; - int32 order_current = 0; - int32 word; - int32 iter = 0; - int32 while_iter = 0; - std::pair probs_pair; - float log_prob; - float backoff_weight; - bool unigram_check = false; - std::cout << "Start reading ARPA-format file..." << std::endl; - while (getline(data_input, line)) { - std::istringstream is(line); - // get the strings splitted by single space - // brace-initialization with C++11 - std::istream_iterator begin(is), end; - std::vector tokens(begin, end); - if (tokens.size() == 0) continue; - if (tokens.size() == 2 && tokens[0] == "ngram") { - std::string substring = tokens[1].substr(2); - int32 count = std::stoi(substring); // get "123456" from "1=123456" - counts_.push_back(count); - order = std::stoi(tokens[1].substr(0)); - continue; - } - if (tokens.size() == 1 && tokens[0] == "\\1-grams:") { - ngram_order_ = order; // ngram_order - probs_.resize(ngram_order_); - std::cout << "Ngram order is: " << ngram_order_ << std::endl; - } - // read current order - if (tokens.size() == 1 && tokens[0] != "\\data\\" && - tokens[0] != "\\end\\") { - order_current = std::stoi(tokens[0].substr(1,1)); - continue; // get the order info and skip processing this line - } - // read vocab and initialize probs of unigrams - if (order_current == 1) { - std::string word_s; - if (tokens.back() != "") { - word_s = tokens.end()[-2]; - backoff_weight = std::stof(tokens.back()); - } else { - word_s = tokens.back(); - backoff_weight = 0; - } - word = iter; - vocab_.insert({word_s, word}); - iter++; - vocab_size_++; - if (iter == counts_[0]) { - bool unigram_check = true; - std::cout << "vocab size: " << vocab_size_ << std::endl; - } - HistType history; - history.resize(0); - log_prob = std::stof(tokens[0]); - probs_pair = std::make_pair(log_prob, backoff_weight); - probs_[order_current - 1][history].insert({word, probs_pair}); - continue; - } - // read each ngram and its log-probs and back-off weights - // read probs of order 1 to N - 1 - if (order_current < ngram_order_ && order_current > 1) { - // case1: backoff_weights exist - if ((tokens.size() > order_current + 1) && (tokens.back() != "") && tokens[0] != "ngram") { - // get the integer for word, the last second string in tokens - std::string second_last = tokens.end()[-2]; - unordered_map::iterator it = vocab_.find(second_last); - if (it != vocab_.end()) { - word = it->second; - } else { - std::cout << "OOV word found: " << tokens.end()[-2] << std::endl; - } - int32 len_hist = tokens.size() - 3; // exclude the word, log-prob, and bow - HistType history; - for (int32 i = 1; i < len_hist + 1; i++) { - unordered_map::iterator it = vocab_.find(tokens[i]); - if (it != vocab_.end()) { - history.push_back(it->second); - } else { - std::cout << "OOV found in history: " << tokens[i] << std::endl; - } - } - assert (history.size() == order_current - 1); - log_prob = std::stof(tokens[0]); - backoff_weight = std::stof(tokens.back()); - probs_pair = std::make_pair(log_prob, backoff_weight); - probs_[order_current - 1][history].insert({word, probs_pair}); - continue; - } - // case2: no backoff_weights - if (tokens.size() == order_current + 1 && (tokens.back() == "") && tokens[0] != "ngram") { - unordered_map::iterator it = vocab_.find(tokens.back()); - if (it != vocab_.end()) { - word = it->second; - } - int32 len_hist = tokens.size() - 2; // 
exclude the word and log-prob - HistType history; - assert (len_hist > 0); - for (int32 i = 1; i < len_hist + 1; i++) { - unordered_map::iterator it = vocab_.find(tokens[i]); - if (it != vocab_.end()) { - history.push_back(it->second); - } else { - std::cout << "OOV found in history: " << tokens[i] << std::endl; - } - } - assert (history.size() == order_current - 1); - log_prob = std::stof(tokens[0]); - backoff_weight = 0; // backoff_weight in log space should be 1 (no backoff) - probs_pair = std::make_pair(log_prob, backoff_weight); - probs_[order_current - 1][history].insert({word, probs_pair}); - continue; - } - } else if (order_current == ngram_order_) { // read probs of order N - if (tokens.size() > 2) { - std::string word_s = tokens.back(); - unordered_map::iterator it = vocab_.find(word_s); - if (it != vocab_.end()) { - word = it->second; - } - int32 len_hist = tokens.size() - 2; // exclude the word and log-prob - HistType history; - assert (len_hist > 0); - for (int32 i = 1; i < len_hist + 1; i++) { - unordered_map::iterator it = vocab_.find(tokens[i]); - if (it != vocab_.end()) { - history.push_back(it->second); - } else { - std::cout << "OOV found in history: " << tokens[i] << std::endl; - } - } - log_prob = std::stof(tokens[0]); - backoff_weight = 0; // backoff_weight in log space should be 1 (no backoff) - probs_pair = std::make_pair(log_prob, backoff_weight); - probs_[order_current - 1][history].insert({word, probs_pair}); - continue; - } - } - } - std::cout << "Finish reading ARPA-format file." << std::endl; -} - -float NgramModel::GetProb(int32 order, const int32 word, const HistType& history) { - float prob = 0.0; - auto it = probs_[order - 1].find(history); - if (it != probs_[order - 1].end() && - probs_[order-1][history].find(word) != probs_[order-1][history].end()) { - prob += probs_[order-1][history][word].first; - } else { // backoff to the previous order - order--; - if (order >= 1) { - HistType::const_iterator first = history.begin() + 1; - HistType::const_iterator last = history.end(); - HistType h(first, last); - prob += GetProb(order, word, h); - int32 word_new = history.back(); - HistType::const_iterator last_new = history.end() - 1; - HistType h_new(history.begin(), last_new); - prob += GetBackoffWeight(order, word_new, h_new); - } - } - return prob; -} - -float NgramModel::GetBackoffWeight(int32 order, const int32 word, const HistType& history) { - float bow = 0.0; - auto it = probs_[order - 1].find(history); - if (it != probs_[order - 1].end()) { - auto it2 = probs_[order - 1][history].find(word); - if (it2 != probs_[order - 1][history].end()) { - bow = (it2->second).second; - } - } - return bow; -} - -void NgramModel::ComputeWordPdf(const HistType& history, std::vector* pdf) { - int32 order = history.size(); - float prob = 0.0; - for (int32 i = 0; i < vocab_size_; i++) { - auto it = probs_[order].find(history); - int32 word = i; - if (it != probs_[order].end()) { - auto it2 = probs_[order][history].find(word); - if (it2 != probs_[order][history].end()) { - prob = pow(10, (it2->second).first); - (*pdf).push_back(prob); - } else { - HistType::const_iterator first = history.begin() + 1; - HistType::const_iterator last = history.end(); - HistType h(first, last); - int32 word_new = history.back(); - HistType::const_iterator last_new = history.end() - 1; - HistType h_new(history.begin(), last_new); - prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * - pow(10, GetProb(order, word, h)); - (*pdf).push_back(prob); - } - } else { - HistType::const_iterator 
-void NgramModel::ComputeWordPdf(const HistType& history, std::vector<float>* pdf) {
-  int32 order = history.size();
-  float prob = 0.0;
-  for (int32 i = 0; i < vocab_size_; i++) {
-    auto it = probs_[order].find(history);
-    int32 word = i;
-    if (it != probs_[order].end()) {
-      auto it2 = probs_[order][history].find(word);
-      if (it2 != probs_[order][history].end()) {
-        prob = pow(10, (it2->second).first);
-        pdf->push_back(prob);
-      } else {
-        // the word is not listed after this history: back off
-        HistType h(history.begin() + 1, history.end());
-        int32 word_new = history.back();
-        HistType h_new(history.begin(), history.end() - 1);
-        prob = pow(10, GetBackoffWeight(order, word_new, h_new)) *
-               pow(10, GetProb(order, word, h));
-        pdf->push_back(prob);
-      }
-    } else {
-      // the history itself is not listed: back off
-      HistType h(history.begin() + 1, history.end());
-      int32 word_new = history.back();
-      HistType h_new(history.begin(), history.end() - 1);
-      prob = pow(10, GetBackoffWeight(order, word_new, h_new)) *
-             pow(10, GetProb(order, word, h));
-      pdf->push_back(prob);
-    }
-  }
-}
-
-// Accumulate the weight of each history and of its back-off suffixes
-void NgramModel::ComputeHistoriesWeights() {
-  for (auto it = histories_.begin(); it != histories_.end(); ++it) {
-    HistType history(*it);
-    assert(history.size() <= ngram_order_);
-    for (int32 i = 0; i < history.size() + 1; i++) {
-      HistType h_tmp = history;
-      float prob = 1.0 / histories_.size();
-      while (h_tmp.size() > (history.size() - i)) {
-        HistType h(h_tmp.begin(), h_tmp.end() - 1);
-        int32 word = h_tmp.back();
-        prob *= pow(10, GetBackoffWeight(h_tmp.size(), word, h));
-        h_tmp = h;
-      }
-      HistType h(history.begin() + i, history.end());
-      hists_weights_[h] += prob;
-    }
-  }
-  std::cout << "Size of hists_weights_ is: " << hists_weights_.size() << std::endl;
-}
-
-// Get the weighted pdf over the vocabulary, given all histories
-void NgramModel::ComputeWeightedPdf(std::vector<float>* pdf_w) {
-  float prob = 0;
-  pdf_w->resize(vocab_size_);  // without this, (*pdf_w)[word] += prob would segfault
-  for (int32 i = 0; i < vocab_size_; i++) {
-    for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) {
-      HistType h(it->first);
-      int32 order = h.size();
-      auto it_hist = probs_[order].find(h);
-      if (it_hist != probs_[order].end()) {
-        int32 word = i;
-        auto it_word = probs_[order][h].find(word);
-        if (it_word != probs_[order][h].end()) {
-          if (order > 0) {
-            HistType h1(h.begin(), h.end() - 1);
-            HistType h2(h.begin() + 1, h.end());
-            prob = (it->second) * (pow(10, probs_[order][h][word].first) -
-                pow(10, GetBackoffWeight(order, h.back(), h1)) *
-                pow(10, GetProb(order, word, h2)));
-            (*pdf_w)[word] += prob;
-          } else {
-            prob = (it->second) * pow(10, probs_[order][h][word].first);
-            (*pdf_w)[word] += prob;
-          }
-        }
-      }
-    }  // end looping over histories
-  }  // end looping over words
-}
-
-// sample a word from the given pdf
-int32 NgramModel::SampleWord(const std::vector<float>& pdf) {
-  // generate a cdf from the given pdf
-  std::vector<std::pair<float, float> > cdf;
-  float upper = 0;
-  float lower = 0;
-  for (int32 i = 0; i < pdf.size(); i++) {
-    upper += pdf[i];
-    lower = upper - pdf[i];
-    cdf.push_back(std::make_pair(lower, upper));
-  }
-  float u = 1.0 * rand() / RAND_MAX;
-  for (int32 i = 0; i < cdf.size(); i++) {
-    if (cdf[i].first <= u && u < cdf[i].second) {  // note: 'a <= u < b' does not chain in C++
-      return i;
-    }
-  }
-  return cdf.size() - 1;  // u can be exactly 1.0, since rand() can return RAND_MAX
-}
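SampleWord draws from the pdf by inverting the cumulative distribution. Below is a minimal standalone sketch of the same idea, using <random> instead of rand() and a binary search instead of the linear scan; the function name and setup are illustrative only, not part of the patch:

    #include <algorithm>
    #include <numeric>
    #include <random>
    #include <vector>

    // Draw an index i with probability pdf[i] / sum(pdf) by inverting the CDF.
    // Assumes pdf is non-empty with positive total mass.
    int SampleFromPdf(const std::vector<float>& pdf, std::mt19937* rng) {
      std::vector<float> cdf(pdf.size());
      std::partial_sum(pdf.begin(), pdf.end(), cdf.begin());  // running sums
      std::uniform_real_distribution<float> uniform(0.0f, cdf.back());
      float u = uniform(*rng);  // u lies in [0, total mass)
      // upper_bound works because the running sums are non-decreasing;
      // it returns the first cumulative value strictly greater than u.
      return std::upper_bound(cdf.begin(), cdf.end(), u) - cdf.begin();
    }

    // usage: std::mt19937 rng(1234); int w = SampleFromPdf(pdf, &rng);

Storing a single running-sum vector also avoids the (lower, upper) pair per entry that the patch's version builds.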
-// Test sampling by drawing many words and comparing to the model pdf
-void NgramModel::TestSampling(int32 iters) {
-  ComputeHistoriesWeights();
-  std::vector<float> pdf;
-  ComputeWeightedPdf(&pdf);
-
-  // Compute the difference between the sampled and the model distributions
-  std::vector<float> pdf_est;
-  pdf_est.resize(vocab_size_);
-  int32 word;
-  int32 count_nons = 0;
-  for (int32 i = 0; i < iters; i++) {
-    word = SampleWord(pdf);
-    if (word >= vocab_size_ || word < 0) {  // >= guards the pdf_est[word] access below
-      std::cout << "the next word is " << word << std::endl;
-      count_nons += 1;
-      continue;
-    } else {
-      pdf_est[word] += 1.0;
-    }
-  }
-  // normalization
-  float ed = 0;
-  for (int32 i = 0; i < vocab_size_; i++) {
-    pdf_est[i] /= iters;  // indexed by i, not by the last sampled word
-    ed += pow(pdf_est[i] - pdf[i], 2);
-  }
-  ed = pow(ed, 0.5);
-  std::cout << "Run " << iters << " times, Euclidean distance (expect < 0.05) is "
-            << ed << std::endl;
-  std::cout << "Number of out-of-range samples: " << count_nons << std::endl;
-}
-
-// Test the read-in language model
-void NgramModel::TestReadingModel() {
-  std::cout << "Testing model reading part..." << std::endl;
-  std::cout << "Vocab size is: " << vocab_size_ << std::endl;
-  std::cout << "Ngram_order is: " << ngram_order_ << std::endl;
-  assert(probs_.size() == counts_.size());
-  for (int32 i = 0; i < ngram_order_; i++) {
-    int32 size_ngrams = 0;
-    std::cout << "Test: for order " << (i + 1) << std::endl;
-    std::cout << "Expected number of " << (i + 1) << "-grams: " << counts_[i] << std::endl;
-    for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) {
-      HistType h(it1->first);
-      for (auto it2 = probs_[i][h].begin(); it2 != probs_[i][h].end(); ++it2) {
-        size_ngrams++;  // number of words given the history
-      }
-    }
-    std::cout << "Read in number of " << (i + 1) << "-grams: " << size_ngrams << std::endl;
-  }
-  std::cout << "Assert sum of unigram probs equal to 1..." << std::endl;
-  float prob_sum = 0.0;
-  int32 count = 0;
-  for (auto it1 = probs_[0].begin(); it1 != probs_[0].end(); ++it1) {
-    HistType h(it1->first);
-    for (auto it2 = probs_[0][h].begin(); it2 != probs_[0][h].end(); ++it2) {
-      prob_sum += 1.0 * pow(10.0, (it2->second).first);
-      count++;
-    }
-  }
-  std::cout << "Number of total words: " << count << std::endl;
-  std::cout << "Sum of unigram probs equal to " << prob_sum << std::endl;
-
-  std::cout << "Assert sum of bigram probs given a history equal to 1..." << std::endl;
-  prob_sum = 0.0;
-  auto it1 = probs_[1].begin();
-  HistType h(it1->first);
-  for (auto it = vocab_.begin(); it != vocab_.end(); ++it) {
-    auto it2 = probs_[1][h].find(it->second);
-    if (it2 != probs_[1][h].end()) {
-      prob_sum += 1.0 * pow(10, (it2->second).first);
-    } else {
-      prob_sum += pow(10, GetProb(2, it->second, h));
-    }
-  }
-  std::cout << "Sum of bigram probs given a history equal to " << prob_sum << std::endl;
-}
-
-// Read histories of integers from a file
-void NgramModel::ReadHistories(char* file) {
-  std::ifstream data_input(file);
-  if (!data_input.is_open()) {
-    std::cerr << "error opening '" << file << "' for reading\n";
-    exit(1);
-  }
-  std::string line;
-  std::cout << "Start reading histories..." << std::endl;
-  while (getline(data_input, line)) {
-    std::istringstream is(line);
-    std::istream_iterator<std::string> begin(is), end;
-    std::vector<std::string> tokens(begin, end);
-    HistType history;
-    int32 word;
-    for (int32 i = 0; i < tokens.size(); i++) {
-      auto it = vocab_.find(tokens[i]);
-      if (it != vocab_.end()) {
-        word = it->second;
-      } else {
-        std::string word_s = "<unk>";
-        auto it_unk = vocab_.find(word_s);
-        assert(it_unk != vocab_.end());
-        word = it_unk->second;
-      }
-      history.push_back(word);
-    }
-    if (history.size() >= ngram_order_) {
-      // keep only the last (ngram_order_ - 1) words
-      std::reverse(history.begin(), history.end());
-      history.resize(ngram_order_ - 1);
-      std::reverse(history.begin(), history.end());
-    }
-    histories_.push_back(history);
-  }
-  std::cout << "Finished reading histories." << std::endl;
-}
diff --git a/src/rnnlm/sample_a_word.h b/src/rnnlm/sample_a_word.h
deleted file mode 100644
index 86fa4e1d4ee..00000000000
--- a/src/rnnlm/sample_a_word.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// sample_a_word.h
-
-// Copyright 2016  Ke Li
-
-// See ../COPYING for clarification regarding multiple authors
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//  http://www.apache.org/licenses/LICENSE-2.0
-//
-// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-// MERCHANTABILITY OR NON-INFRINGEMENT.
-// See the Apache 2 License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef SAMPLE_A_WORD_H_
-#define SAMPLE_A_WORD_H_
-
-#include <cassert>
-#include <cstdlib>
-
-#ifdef _MSC_VER
-#include <unordered_map>
-#include <unordered_set>
-using std::unordered_map;
-using std::unordered_set;
-#elif __cplusplus > 199711L || defined(__GXX_EXPERIMENTAL_CXX0X__)
-#include <unordered_map>
-#include <unordered_set>
-using std::unordered_map;
-using std::unordered_set;
-#else
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
-using std::tr1::unordered_map;
-using std::tr1::unordered_set;
-#endif
-
-#include <sys/time.h>
-#include <unistd.h>
-
-#include <algorithm>
-#include <cmath>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <sstream>
-#include <string>
-#include <utility>
-#include <vector>
-
-typedef int32_t int32;
-
-/// A hashing function-object for vectors of ints.
-struct IntVectorHasher {  // hashing function for vector<int32>.
-  size_t operator()(const std::vector<int32> &x) const {
-    size_t ans = 0;
-    std::vector<int32>::const_iterator iter = x.begin(), end = x.end();
-    for (; iter != end; ++iter) {
-      ans *= kPrime;
-      ans += *iter;
-    }
-    return ans;
-  }
- private:
-  static const int kPrime = 7853;
-};
-
-typedef std::vector<int32> HistType;
-typedef unordered_map<int32, std::pair<float, float> > WordToProbsMap;
-typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType;
-typedef unordered_map<HistType, float, IntVectorHasher> HistWeightsType;
-
-class Timer {
- public:
-  Timer() { Reset(); }
-
-  void Reset() { gettimeofday(&this->time_start_, &time_zone_); }
-
-  /// Returns time in seconds.
-  double Elapsed() {
-    struct timeval time_end;
-    gettimeofday(&time_end, &time_zone_);
-    double t1, t2;
-    t1 = static_cast<double>(time_start_.tv_sec) +
-         static_cast<double>(time_start_.tv_usec) / (1000 * 1000);
-    t2 = static_cast<double>(time_end.tv_sec) +
-         static_cast<double>(time_end.tv_usec) / (1000 * 1000);
-    return t2 - t1;
-  }
-
- private:
-  struct timeval time_start_;
-  struct timezone time_zone_;
-};
-
-class NgramModel {
- public:
-  // Constructor for testing
-  NgramModel(char* arpa_file, char* histories_file);
-
-  void TestReadingModel();
-
-  void TestSampling(int32 iters);
-
- private:
-  // This function returns the log probability of an ngram term from the ARPA
-  // LM if it is found; it backs off to the lower-order model when the ngram
-  // term does not exist.
-  float GetProb(int32 order, const int32 word, const HistType& history);
-
-  // Get the back-off weight of an ngram in the read-in model
-  float GetBackoffWeight(int32 order, const int32 word, const HistType& history);
-
-  // Compute a pdf over words in the vocab given a history
-  void ComputeWordPdf(const HistType& history, std::vector<float>* pdf);
-
-  // Compute weights of given histories
-  void ComputeHistoriesWeights();
-
-  // Compute weighted pdf given all histories
-  void ComputeWeightedPdf(std::vector<float>* weighted_pdf);
-
-  // Sample the next word
-  int32 SampleWord(const std::vector<float>& pdf);
-
-  // Read the language model probs_ from a stream;
-  // called from the constructor; checks the sum of unigrams
-  void ReadARPAModel(char* arpa_file);
-
-  void ReadHistories(char* file);
-
-  // N-gram order of the read-in LM.
-  int32 ngram_order_;
-
-  // Counts of each ngram
-  std::vector<int32> counts_;
-
-  // Vocab size
-  int32 vocab_size_;
-
-  // Vocab
-  unordered_map<std::string, int32> vocab_;
-
-  // N-gram probabilities.
-  std::vector<NgramType> probs_;
-
-  // Histories' weights
-  HistWeightsType hists_weights_;
-
-  // The given N histories
-  std::vector<HistType> histories_;
-};
-
-#endif

From 21034856fc578e09ef88b3a0c56ffe0b4026fd72 Mon Sep 17 00:00:00 2001
From: Ke Li
Date: Fri, 28 Apr 2017 00:06:05 -0400
Subject: [PATCH 5/5] Add ComputeOutputWords function; remove auto; remove
 histories and hists_weights as class members

---
 src/rnnlm/arpa-sampling.cc    | 234 +++++++++++++++-------------------
 src/rnnlm/arpa-sampling.h     |  73 ++++-------
 src/rnnlm/rnnlm-utils-test.cc |   7 +-
 3 files changed, 133 insertions(+), 181 deletions(-)

diff --git a/src/rnnlm/arpa-sampling.cc b/src/rnnlm/arpa-sampling.cc
index d7be9ea7f3b..faf380528d0 100644
--- a/src/rnnlm/arpa-sampling.cc
+++ b/src/rnnlm/arpa-sampling.cc
@@ -9,6 +9,7 @@
 namespace kaldi {
 
+// This function reads each ngram line in the ARPA file.
 void ArpaSampling::ConsumeNGram(const NGram& ngram) {
   int32 cur_order = ngram.words.size();
   int32 word = ngram.words.back();  // word is the last word in the vector words
@@ -39,9 +40,13 @@ void ArpaSampling::HeaderAvailable() {
   probs_.resize(ngram_order_);
 }
 
+// This function returns the probability of the ngram (history, word) for the
+// given order if the history, and the word given the history, exist.
+// Otherwise it backs off to the previous order and recursively searches the
+// lower-order ngrams, down to the unigrams.
 BaseFloat ArpaSampling::GetProb(int32 order, int32 word, const HistType& history) {
   BaseFloat prob = 0.0;
-  auto it = probs_[order - 1].find(history);
+  NgramType::const_iterator it = probs_[order - 1].find(history);
   if (it != probs_[order - 1].end() &&
       probs_[order-1][history].find(word) != probs_[order-1][history].end()) {
     prob += probs_[order-1][history][word].first;
@@ -61,29 +66,31 @@ BaseFloat ArpaSampling::GetProb(int32 order, int32 word, const HistType& history) {
   return prob;
 }
 
+// This function returns the back-off weight of the ngram (history, word).
 BaseFloat ArpaSampling::GetBackoffWeight(int32 order, int32 word, const HistType& history) {
   BaseFloat bow = 0.0;
-  auto it = probs_[order - 1].find(history);
+  NgramType::const_iterator it = probs_[order - 1].find(history);
   if (it != probs_[order - 1].end()) {
-    auto it2 = probs_[order - 1][history].find(word);
+    WordToProbsMap::const_iterator it2 = probs_[order - 1][history].find(word);
     if (it2 != probs_[order - 1][history].end()) {
-      bow = (it2->second).second;
+      bow = it2->second.second;
     }
   }
   return bow;
 }
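In equation form, the rule GetProb implements is the standard ARPA back-off (all quantities in log base 10); this is the editor's restatement of the code above, not text from the patch:

    \log_{10} p(w \mid h) =
      \begin{cases}
        \mathrm{logprob}(h, w) & \text{if the n-gram } (h, w) \text{ is listed,} \\
        \mathrm{bow}(h) + \log_{10} p(w \mid h') & \text{otherwise,}
      \end{cases}

where h' drops the oldest word of h, and bow(h) is taken as 0 when h itself is not listed, as GetBackoffWeight does.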
+// This function computes the estimated pdf given a history.
 void ArpaSampling::ComputeWordPdf(const HistType& history,
     std::vector<std::pair<int32, BaseFloat> >* pdf) {
   int32 order = history.size();
   BaseFloat prob = 0.0;
-  (*pdf).resize(num_words_); // if do not do this, (*pdf)[word] += prob will get seg fault
+  (*pdf).resize(num_words_);
   for (int32 i = 0; i < num_words_; i++) {
-    auto it = probs_[order].find(history);
-    int32 word = vocab_[i].second; // get word from the map
+    NgramType::const_iterator it = probs_[order].find(history);
+    int32 word = vocab_[i].second;
     if (it != probs_[order].end()) {
-      auto it2 = probs_[order][history].find(word);
+      WordToProbsMap::const_iterator it2 = probs_[order][history].find(word);
       if (it2 != probs_[order][history].end()) {
-        prob = pow(10, (it2->second).first);
+        prob = pow(10, it2->second.first);
         (*pdf)[i].first = word;
         (*pdf)[i].second += prob;
       } else {
@@ -93,9 +100,7 @@ void ArpaSampling::ComputeWordPdf(const HistType& history,
         HistType h(history.begin() + 1, history.end());
         int32 word_new = history.back();
         HistType h_new(history.begin(), history.end() - 1);
-        prob = pow(10, GetBackoffWeight(order, word_new, h_new)) *
-               pow(10, GetProb(order, word, h));
+        prob = pow(10, GetBackoffWeight(order, word_new, h_new) +
+                       GetProb(order, word, h));
         (*pdf)[i].first = word;
         (*pdf)[i].second += prob;
       }
     }
   }
 }
 
-// Get history weights
-void ArpaSampling::ComputeHistoriesWeights() {
-  for (auto it = histories_.begin(); it != histories_.end(); ++it) {
+// Get history weights for a given list of histories
+HistWeightsType ArpaSampling::ComputeHistoriesWeights(std::vector<HistType> histories) {
+  HistWeightsType hists_weights;
+  for (std::vector<HistType>::iterator it = histories.begin(); it != histories.end();
+       ++it) {
     HistType history(*(it));
     KALDI_ASSERT(history.size() <= ngram_order_);
     for (int32 i = 0; i < history.size() + 1; i++) {
       HistType h_tmp = history;
-      BaseFloat prob = 1.0 / histories_.size();
+      BaseFloat prob = 1.0 / histories.size();
       while (h_tmp.size() > (history.size() - i)) {
         HistType::iterator last = h_tmp.end() - 1;
         HistType h(h_tmp.begin(), last);
@@ -132,37 +138,39 @@ void ArpaSampling::ComputeHistoriesWeights() {
       }
       HistType::iterator begin = history.begin() + i;
       HistType h(begin, history.end());
-      hists_weights_[h] += prob;
+      hists_weights[h] += prob;
     }
   }
+  return hists_weights;
 }
 
-// Get weighted pdf
-void ArpaSampling::ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >* pdf_w) {
+// Get weighted pdf given a list of histories
+void ArpaSampling::ComputeWeightedPdf(HistWeightsType hists_weights,
+    std::vector<std::pair<int32, BaseFloat> >* pdf_w) {
   BaseFloat prob = 0;
   (*pdf_w).clear();
-  (*pdf_w).resize(num_words_); // if do not do this, (*pdf_w)[word] += prob will get seg fault
+  (*pdf_w).resize(num_words_);
   for (int32 i = 0; i < num_words_; i++) {
-    for (auto it = hists_weights_.begin(); it != hists_weights_.end(); ++it) {
+    for (HistWeightsType::const_iterator it = hists_weights.begin();
+         it != hists_weights.end(); ++it) {
       HistType h(it->first);
       int32 order = h.size();
-      auto it_hist = probs_[order].find(h);
+      NgramType::const_iterator it_hist = probs_[order].find(h);
       if (it_hist != probs_[order].end()) {
         int32 word = vocab_[i].second;
-        auto it_word = probs_[order][h].find(word);
+        WordToProbsMap::const_iterator it_word = probs_[order][h].find(word);
        if (it_word != probs_[order][h].end()) {
           if (order > 0) {
             HistType::iterator last = h.end() - 1;
             HistType::iterator first = h.begin() + 1;
             HistType h1(h.begin(), last);
             HistType h2(first, h.end());
-            prob = (it->second) * (pow(10, probs_[order][h][word].first) -
-                pow(10, GetBackoffWeight(order, h.back(), h1))
-                * pow(10, GetProb(order, word, h2)));
+            prob = it->second * (pow(10, probs_[order][h][word].first) -
+                pow(10, GetBackoffWeight(order, h.back(), h1) + GetProb(order, word, h2)));
             (*pdf_w)[i].first = word;
             (*pdf_w)[i].second += prob;
           } else {
-            prob = (it->second) * pow(10, probs_[order][h][word].first);
+            prob = it->second * pow(10, probs_[order][h][word].first);
             (*pdf_w)[i].first = word;
             (*pdf_w)[i].second += prob;
           }
         }
       }
     }  // end reading history
   }  // end reading words
 }
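As far as the editor can tell, the subtraction in ComputeWeightedPdf (and in ComputeOutputWords below) realizes the following average over the supplied history set: ComputeHistoriesWeights already pushes each history's weight down to its back-off suffixes, scaled by 10^bow at each step, so the explicitly listed mass must have its back-off portion removed to avoid double counting,

    \bar{p}(w) = \sum_{h} \alpha(h) \left( p_{\mathrm{listed}}(w \mid h)
                 - 10^{\mathrm{bow}(h)}\, p(w \mid h') \right),

where \alpha(h) is the accumulated weight of history h and the empty (unigram) history contributes its mass with nothing subtracted. This reading is the editor's, not stated in the patch.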
@@ -172,10 +180,42 @@ void ArpaSampling::ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >
 
-void ArpaSampling::RandomGenerateHistories() {
-  // clear previous histories
-  histories_.clear();
-  // randomly generate histories
+// This function computes the words observed after the given histories,
+// together with their corresponding probabilities.
+void ArpaSampling::ComputeOutputWords(std::vector<HistType> histories,
+    unordered_map<int32, BaseFloat>* pdf_w) {
+  HistWeightsType hists_weights = ComputeHistoriesWeights(histories);
+  BaseFloat prob = 0;
+  for (HistWeightsType::const_iterator it = hists_weights.begin();
+       it != hists_weights.end(); ++it) {
+    HistType h(it->first);
+    int32 order = h.size();
+    NgramType::const_iterator it_hist = probs_[order].find(h);
+    if (it_hist != probs_[order].end()) {
+      for (WordToProbsMap::const_iterator it_word = probs_[order][h].begin();
+           it_word != probs_[order][h].end(); ++it_word) {
+        int32 word = it_word->first;
+        if (order > 0) {
+          HistType::iterator last = h.end() - 1;
+          HistType::iterator first = h.begin() + 1;
+          HistType h1(h.begin(), last);
+          HistType h2(first, h.end());
+          prob = it->second * (pow(10, probs_[order][h][word].first) -
+              pow(10, GetBackoffWeight(order, h.back(), h1) + GetProb(order, word, h2)));
+          unordered_map<int32, BaseFloat>::iterator map_it = (*pdf_w).find(word);
+          if (map_it != (*pdf_w).end()) {
+            (*pdf_w)[word] += prob;
+          } else {
+            (*pdf_w).insert({word, prob});
+          }
+        }
+      }
+    }
+  }
+}
+
+// This function randomly generates at least 5 (and up to 1004) histories.
+std::vector<HistType> ArpaSampling::RandomGenerateHistories() {
+  std::vector<HistType> histories;
   int32 num_histories = rand() % 1000 + 5;  // generate at least 5 histories
   for (int32 i = 0; i < num_histories; i++) {
     HistType hist;
@@ -188,18 +228,20 @@ std::vector<HistType> ArpaSampling::RandomGenerateHistories() {
       KALDI_ASSERT(word > 0 && word <= vocab_.size());
       hist.push_back(word);
     }
-    histories_.push_back(hist);
+    histories.push_back(hist);
   }
+  return histories;
 }
 
 // this function checks that the two pdfs estimated from 1) the weighted
 // histories and 2) the direct computation are the same
 void ArpaSampling::TestPdfsEqual() {
-  RandomGenerateHistories();
-  hists_weights_.clear();
-  ComputeHistoriesWeights();
+  std::vector<HistType> histories;
+  histories = RandomGenerateHistories();
+  HistWeightsType hists_weights;
+  hists_weights = ComputeHistoriesWeights(histories);
   std::vector<std::pair<int32, BaseFloat> > pdf_hist_weight;
-  ComputeWeightedPdf(&pdf_hist_weight);
+  ComputeWeightedPdf(hists_weights, &pdf_hist_weight);
   // check the weighted pdf sums to 1
   BaseFloat sum = 0;
   for (int32 i = 0; i < num_words_; i++) {
     sum += pdf_hist_weight[i].second;
   }
@@ -209,12 +251,12 @@ void ArpaSampling::TestPdfsEqual() {
   // get the average pdf
   std::vector<std::pair<int32, BaseFloat> > pdf;
   pdf.resize(num_words_);
-  for (int32 i = 0; i < histories_.size(); i++) {
+  for (int32 i = 0; i < histories.size(); i++) {
     std::vector<std::pair<int32, BaseFloat> > pdf_h;
-    ComputeWordPdf(histories_[i], &pdf_h);
+    ComputeWordPdf(histories[i], &pdf_h);
     for (int32 j = 0; j < pdf_h.size(); j++) {
       pdf[j].first = pdf_h[j].first;
-      pdf[j].second += pdf_h[j].second / histories_.size();
+      pdf[j].second += pdf_h[j].second / histories.size();
     }
   }
   // check the averaged pdf sums to 1
@@ -231,74 +273,19 @@ void ArpaSampling::TestPdfsEqual() {
   KALDI_ASSERT(ApproxEqual(diff, 0.0));
 }
 
-// this function returns the log probability of the given sentence
-BaseFloat ArpaSampling::ComputeSentenceProb(const std::vector<int32>& sentence) {
-  BaseFloat prob = 0;
-  const fst::SymbolTable* sym = Symbols();
-  for (int32 i = 1; i < sentence.size(); i++) {
-    if (i < ngram_order_ - 1) {
-      HistType::const_iterator last = sentence.begin() + i;
-      HistType h(sentence.begin(), last);
-      prob += GetProb(i + 1, sentence[i], h);
-    } else {
-      HistType::const_iterator first = sentence.begin() + i + 1 - ngram_order_;
-      HistType::const_iterator last = sentence.begin() + i;
-      HistType h(first, last);
-      KALDI_ASSERT(h.size() == ngram_order_ - 1);
-      prob += GetProb(ngram_order_, sentence[i], h);
-    }
-    std::string word_s = sym->Find(sentence[i]);
-    if (sentence[i] == kUnk) {
-      word_s = unk_symbol_;
-    }
-  }
-  return prob;
-}
-
-// this function computes the total log probability of all test sentences
-BaseFloat ArpaSampling::ComputeAllSentencesProb(const std::vector<std::vector<int32> >& sentences) {
-  BaseFloat prob = 0;
-  for (int32 i = 0; i < sentences.size(); i++) {
-    KALDI_ASSERT(sentences[i].size() >= 3);
-    prob += ComputeSentenceProb(sentences[i]);
-  }
-  int32 len = sentences.size();
-  KALDI_LOG << "Total log-probabilities of " << len << " sentences are: " << prob;
-  return prob;
-}
-
-void ArpaSampling::PrintHist(const HistType& h) {
-  KALDI_LOG << "Current hist is: ";
-  for (int32 i = 0; i < h.size(); i++) {
-    KALDI_LOG << h[i] << " ";
-  }
-}
-
-// Test the read-in model by computing the total prob of given sentences
-void ArpaSampling::TestProbs(std::istream &is, bool binary) {
-  std::vector<std::vector<int32> > sentences;
-  ReadSentences(is, &sentences);
-  ComputeAllSentencesProb(sentences);
-}
-
 // Test the read-in language model
 void ArpaSampling::TestReadingModel() {
   KALDI_LOG << "Testing model reading part..." << std::endl;
   KALDI_LOG << "Vocab size is: " << vocab_.size();
-  std::cout << "Print out vocab: " << std::endl;
-  for (int i = 0; i < vocab_.size(); i++) {
-    std::cout << i << " , " << vocab_[i].first << " , " << vocab_[i].second << std::endl;
-  }
   KALDI_LOG << "Ngram_order is: " << ngram_order_;
   KALDI_ASSERT(probs_.size() == ngram_counts_.size());
   for (int32 i = 0; i < ngram_order_; i++) {
     int32 size_ngrams = 0;
     KALDI_LOG << "Test: for order " << (i + 1);
     KALDI_LOG << "Expected number of " << (i + 1) << "-grams: " << ngram_counts_[i];
-    for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) {
+    for (NgramType::const_iterator it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) {
       HistType h(it1->first);
-      for (auto it2 = (probs_[i])[h].begin(); it2 != (probs_[i])[h].end(); ++it2) {
+      for (WordToProbsMap::const_iterator it2 = probs_[i][h].begin(); it2 != probs_[i][h].end(); ++it2) {
         size_ngrams++;  // number of words given the history
       }
     }
@@ -307,10 +294,10 @@ void ArpaSampling::TestReadingModel() {
   KALDI_LOG << "Assert sum of unigram probs equal to 1...";
   BaseFloat prob_sum = 0.0;
   int32 count = 0;
-  for (auto it1 = (probs_[0]).begin(); it1 != (probs_[0]).end();++it1) {
+  for (NgramType::const_iterator it1 = probs_[0].begin(); it1 != probs_[0].end(); ++it1) {
     HistType h(it1->first);
-    for (auto it2 = (probs_[0])[h].begin(); it2 != (probs_[0])[h].end(); ++it2) {
-      prob_sum += 1.0 * pow(10.0, (it2->second).first);
+    for (WordToProbsMap::const_iterator it2 = probs_[0][h].begin(); it2 != probs_[0][h].end(); ++it2) {
+      prob_sum += 1.0 * pow(10.0, it2->second.first);
       count++;
     }
   }
@@ -319,12 +306,12 @@ void ArpaSampling::TestReadingModel() {
 
   KALDI_LOG << "Assert sum of bigram probs given a history equal to 1...";
   prob_sum = 0.0;
-  auto it1 = probs_[1].begin();
+  NgramType::const_iterator it1 = probs_[1].begin();
   HistType h(it1->first);
   for (int32 i = 0; i < num_words_; i++) {
-    auto it2 = probs_[1][h].find(vocab_[i].second);
+    WordToProbsMap::const_iterator it2 = probs_[1][h].find(vocab_[i].second);
     if (it2 != probs_[1][h].end()) {
-      prob_sum += 1.0 * pow(10, (it2->second).first);
+      prob_sum += 1.0 * pow(10, it2->second.first);
     } else {
       prob_sum += pow(10, GetProb(2, vocab_[i].second, h));
     }
   }
@@ -332,39 +319,17 @@ void ArpaSampling::TestReadingModel() {
   KALDI_LOG << "Sum of bigram probs given a history equal to " << prob_sum;
 }
 
-// Read sentences from a file
-void ArpaSampling::ReadSentences(std::istream &iss, std::vector<std::vector<int32> >* sentences) {
-  const fst::SymbolTable* sym = Symbols();
-  std::string line;
-  KALDI_LOG << "Start reading sentences...";
-  while (getline(iss, line)) {
-    std::istringstream is(line);
-    std::istream_iterator<std::string> begin(is), end;
-    std::vector<std::string> tokens(begin, end);
-    std::vector<int32> sentence;
-    int32 word;
-    int32 bos = sym->Find(bos_symbol_);
-    sentence.push_back(bos);
-    for (int32 i = 0; i < tokens.size(); i++) {
-      word = sym->Find(tokens[i]);
-      if (word == fst::SymbolTable::kNoSymbol) {
-        word = sym->Find(unk_symbol_);
-      }
-      sentence.push_back(word);
-    }
-    int32 eos = sym->Find(eos_symbol_);
-    sentence.push_back(eos);
-    (*sentences).push_back(sentence);
-  }
-  KALDI_LOG << "Finished reading sentences.";
+int32 ArpaSampling::GetNgramOrder() {
+  return ngram_order_;
 }
 
 // Read histories of integers from a file
-void ArpaSampling::ReadHistories(std::istream &is, bool binary) {
+std::vector<HistType> ArpaSampling::ReadHistories(std::istream &is, bool binary) {
   if (binary) {
     KALDI_ERR << "binary-mode reading is not implemented for ArpaFileParser";
   }
   const fst::SymbolTable* sym = Symbols();
+  std::vector<HistType> histories;
   std::string line;
   KALDI_LOG << "Start reading histories from file...";
   while (getline(is, line)) {
@@ -381,13 +346,14 @@ std::vector<HistType> ArpaSampling::ReadHistories(std::istream &is, bool binary) {
       history.push_back(word);
     }
     if (history.size() >= ngram_order_) {
-      std::reverse(history.begin(), history.end());
-      history.resize(ngram_order_ - 1);
-      std::reverse(history.begin(), history.end());
+      // keep only the most recent (ngram_order_ - 1) words
+      history.erase(history.begin(), history.end() - ngram_order_ + 1);
     }
-    histories_.push_back(history);
+    histories.push_back(history);
   }
   KALDI_LOG << "Finished reading histories from file.";
+  return histories;
 }
 
 }  // end of kaldi
diff --git a/src/rnnlm/arpa-sampling.h b/src/rnnlm/arpa-sampling.h
index 1fdedeb573a..5f80ca308a5 100644
--- a/src/rnnlm/arpa-sampling.h
+++ b/src/rnnlm/arpa-sampling.h
@@ -24,6 +24,7 @@
 #include <string>
 #include "lm/arpa-file-parser.h"
 #include "fst/fstlib.h"
+#include "util/common-utils.h"
 #include <unordered_map>
 #include <vector>
@@ -39,32 +40,16 @@
 namespace kaldi {
 
 typedef int32_t int32;
 
-/// A hashing function-object for vectors of ints.
-struct IntVectorHasher {  // hashing function for vector<int32>.
-  size_t operator()(const std::vector<int32> &x) const {
-    size_t ans = 0;
-    std::vector<int32>::const_iterator iter = x.begin(), end = x.end();
-    for (; iter != end; ++iter) {
-      ans *= kPrime;
-      ans += *iter;
-    }
-    return ans;
-  }
- private:
-  static const int kPrime = 7853;
-};
-
-// Predefine some symbol values, because any integer is as good as any other.
 enum {
   kEps = 0,
-  // kDisambig,
+  kDisambig,
   kBos,
   kEos,
   kUnk
 };
 
 typedef std::vector<int32> HistType;
 typedef unordered_map<int32, std::pair<BaseFloat, BaseFloat> > WordToProbsMap;
-typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType;
-typedef unordered_map<HistType, BaseFloat, IntVectorHasher> HistWeightsType;
+typedef unordered_map<HistType, WordToProbsMap, VectorHasher<int32> > NgramType;
+typedef unordered_map<HistType, BaseFloat, VectorHasher<int32> > HistWeightsType;
 
 class ArpaSampling : public ArpaFileParser {
  public:
@@ -77,25 +62,34 @@ class ArpaSampling : public ArpaFileParser {
     eos_symbol_ = "</s>";
     unk_symbol_ = "<unk>";
   }
-  // Compute the probability of a given sentence with the ngram_order LM
-  BaseFloat ComputeSentenceProb(const std::vector<int32>& test_sentence);
-
-  // Test the read-in model by computing probs of all sentences with the ngram_order LM
-  BaseFloat ComputeAllSentencesProb(const std::vector<std::vector<int32> >& test_sentences);
+  // This function returns the log probability of an ngram term from the ARPA
+  // LM if it is found; it backs off to the lower-order model when the ngram
+  // term does not exist.
+  BaseFloat GetProb(int32 order, int32 word, const HistType& history);
+
+  // Get the back-off weight of an ngram in the read-in model
+  BaseFloat GetBackoffWeight(int32 order, int32 word, const HistType& history);
+
+  // Compute non-unigram output words and corresponding probs for given histories
+  void ComputeOutputWords(std::vector<HistType> histories,
+      unordered_map<int32, BaseFloat>* pdf_w);
+
+  // Compute weighted pdf given all histories
+  void ComputeWeightedPdf(HistWeightsType hists_weights,
+      std::vector<std::pair<int32, BaseFloat> >* weighted_pdf);
+
+  // Get ngram order
+  int32 GetNgramOrder();
+
   void TestReadingModel();
 
   void TestProbs(std::istream &is, bool binary);
 
   void TestPdfsEqual();
 
-  // print history
-  void PrintHist(const HistType& h);
-
-  void ReadHistories(std::istream &is, bool binary);
+  std::vector<HistType> ReadHistories(std::istream &is, bool binary);
 
-  void ReadSentences(std::istream &is, std::vector<std::vector<int32> >* sentences);
-
 protected:
   // ArpaFileParser overrides.
   virtual void HeaderAvailable();
@@ -103,26 +97,16 @@ class ArpaSampling : public ArpaFileParser {
   virtual void ConsumeNGram(const NGram& ngram);
   virtual void ReadComplete() {}
 
  private:
-  // This function returns the log probability of an ngram term from the ARPA
-  // LM if it is found; it backs off to the lower-order model when the ngram
-  // term does not exist.
-  BaseFloat GetProb(int32 order, int32 word, const HistType& history);
-
-  // Get the back-off weight of an ngram in the read-in model
-  BaseFloat GetBackoffWeight(int32 order, int32 word, const HistType& history);
-
   // For test: randomly generate histories
-  void RandomGenerateHistories();
+  std::vector<HistType> RandomGenerateHistories();
 
   // Compute a pdf of words in the vocab given a history
-  void ComputeWordPdf(const HistType& history, std::vector<std::pair<int32, BaseFloat> >* pdf);
+  void ComputeWordPdf(const HistType& history,
+      std::vector<std::pair<int32, BaseFloat> >* pdf);
 
   // Compute weights of given histories
-  void ComputeHistoriesWeights();
+  HistWeightsType ComputeHistoriesWeights(std::vector<HistType> histories);
 
-  // Compute weighted pdf given all histories
-  void ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >* weighted_pdf);
-
   // N-gram order of the read-in LM.
   int32 ngram_order_;
@@ -150,9 +134,6 @@ class ArpaSampling : public ArpaFileParser {
 
   // Histories' weights
   HistWeightsType hists_weights_;
 
-  // The given N Histories
-  std::vector<HistType> histories_;
-
   // Test sentences
   std::vector<std::vector<int32> > sentences_;
 };
diff --git a/src/rnnlm/rnnlm-utils-test.cc b/src/rnnlm/rnnlm-utils-test.cc
index 0f5673b2035..b081d878512 100644
--- a/src/rnnlm/rnnlm-utils-test.cc
+++ b/src/rnnlm/rnnlm-utils-test.cc
@@ -204,16 +204,21 @@ int main(int argc, char **argv) {
   mdl.TestReadingModel();
 
   Input k2(history_file, &binary);
-  mdl.ReadHistories(k2.Stream(), binary);
+  std::vector<HistType> histories;
+  histories = mdl.ReadHistories(k2.Stream(), binary);
+  unordered_map<int32, BaseFloat> pdf_hist_weight;
+  mdl.ComputeOutputWords(histories, &pdf_hist_weight);
 
   // command for running the test binary: ./test-binary arpa-file history-file
   // arpa-file is the ARPA-format language model
   // history-file has lines of histories, one history per line
   // this test can be slow
+  /*
   KALDI_LOG << "Start weighted histories test...";
   for (int i = 0; i < N / 100; i++) {
     mdl.TestPdfsEqual();
   }
-  KALDI_LOG << "Successfuly pass the test.";
+  KALDI_LOG << "Successfully passed the test.";
+  */
   return 0;
 }
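The test above leaves the (word, probability) map from ComputeOutputWords unused. A minimal, self-contained sketch of one way a caller might consume it, for example to keep only the k most probable output words; TopKWords is a hypothetical helper by the editor, not part of the patch, and plain float stands in for kaldi's BaseFloat:

    #include <algorithm>
    #include <cstdint>
    #include <unordered_map>
    #include <utility>
    #include <vector>

    typedef int32_t int32;

    // Return the k highest-probability (word, prob) pairs from the map
    // filled in by ComputeOutputWords.
    std::vector<std::pair<int32, float> > TopKWords(
        const std::unordered_map<int32, float>& pdf_w, size_t k) {
      std::vector<std::pair<int32, float> > words(pdf_w.begin(), pdf_w.end());
      if (k > words.size()) k = words.size();
      // sort only the first k entries, in descending probability order
      std::partial_sort(words.begin(), words.begin() + k, words.end(),
                        [](const std::pair<int32, float>& a,
                           const std::pair<int32, float>& b) {
                          return a.second > b.second;
                        });
      words.resize(k);
      return words;
    }

For sampling-based RNNLM training, a caller would more likely renormalize these masses together with the unigram distribution, but that step is outside what this patch implements.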