Conversation
src/rnnlm/sample_a_word.cc
Outdated
| if (tokens.size() == 0) continue; | ||
| if (tokens.size() == 2 && tokens[0] == "ngram") { | ||
| std::string substring = tokens[1].substr(2); | ||
| int32 count = std::stoi(substring); // get "123456" from "1=123456" |
src/rnnlm/sample_a_word.cc
Outdated
| std::cout << "OOV found in history: " << tokens[i] << std::endl; | ||
| } | ||
| } | ||
| assert (history.size() == order_current - 1); |
src/rnnlm/sample_a_word.cc
Outdated
| if (it != vocab_.end()) { | ||
| history.push_back(it->second); | ||
| } else { | ||
| std::cout << "OOV found in history: " << tokens[i] << std::endl; |
src/rnnlm/sample_a_word.cc
Outdated
|
|
||
| float NgramModel::GetProb(int32 order, const int32 word, const HistType& history) { | ||
| float prob = 0.0; | ||
| auto it = probs_[order - 1].find(history); |
There was a problem hiding this comment.
no auto.
OK, please check ALL my comments on your previous pull requests. You're making the same mistakes again.
src/rnnlm/sample_a_word.cc
Outdated
| void NgramModel::ReadARPAModel(char* file) { | ||
| std::ifstream data_input(file); | ||
| if (!data_input.is_open()) { | ||
| std::cerr << "error opening '" << file |
hainan-xv
left a comment
There was a problem hiding this comment.
OK, I have seen a lot of issues that I commented on before but that you are still making. Please fix them and I will do another review.
src/rnnlm/arpa-sampling.cc
Outdated
| if (u >= 0 && u < cdf[1].second) { | ||
| return cdf[0].first; | ||
| } | ||
| for (int32 i = 1; i < num_words_; i++) { |
There was a problem hiding this comment.
change this to a binary search. Check my SelectOne() function in rnnlm-utils.cc for reference if you need.
src/rnnlm/sample_a_word.cc
Outdated
| probs = std::make_pair(lower, upper); | ||
| cdf.push_back(probs); | ||
| } | ||
| float u = 1.0 * rand()/RAND_MAX; |
|
Hi Hainan, you can ignore the oldest commit. Sorry I cannot delete it. |
src/rnnlm/arpa-sampling.h
Outdated
| typedef int32_t int32; | ||
|
|
||
| /// A hashing function-object for vectors of ints. | ||
| struct IntVectorHasher { // hashing function for vector<Int>. |
There was a problem hiding this comment.
Is this copied from another file in kaldi?
There was a problem hiding this comment.
Oh this is just an implementation of the VectorHasher in Kaldi. I will just import that file.
| typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType; | ||
| typedef unordered_map<HistType, BaseFloat, IntVectorHasher> HistWeightsType; | ||
|
|
||
| class ArpaSampling : public ArpaFileParser { |
There was a problem hiding this comment.
You need to move all testing-related functions/variables out of the class. Write them in the test cc file.
There was a problem hiding this comment.
What I mean is you should not write a testing function as a member function of a class.
| const char *usage = ""; | ||
| ParseOptions po(usage); | ||
| po.Read(argc, argv); | ||
| std::string arpa_file = po.GetArg(1), history_file = po.GetArg(2); |
There was a problem hiding this comment.
this is probably OK now but you need to change it so it doesn't need cmd arguments
src/rnnlm/arpa-sampling.cc
Outdated
|
|
||
| BaseFloat ArpaSampling::GetProb(int32 order, int32 word, const HistType& history) { | ||
| BaseFloat prob = 0.0; | ||
| auto it = probs_[order - 1].find(history); |
|
|
||
| namespace kaldi { | ||
|
|
||
| void ArpaSampling::ConsumeNGram(const NGram& ngram) { |
There was a problem hiding this comment.
please add comments to functions to describe what they do
src/rnnlm/arpa-sampling.cc
Outdated
| BaseFloat bow = 0.0; | ||
| auto it = probs_[order - 1].find(history); | ||
| if (it != probs_[order - 1].end()) { | ||
| auto it2 = probs_[order - 1][history].find(word); |
src/rnnlm/arpa-sampling.cc
Outdated
| } | ||
|
|
||
| void ArpaSampling::PrintHist(const HistType& h) { | ||
| KALDI_LOG << "Current hist is: "; |
There was a problem hiding this comment.
run this function and you'll see that it has a problem.
| std::string unk_symbol_; | ||
|
|
||
| // Vocab | ||
| std::vector<std::pair<std::string, int32> > vocab_; |
There was a problem hiding this comment.
why do you need this instead of a SymbolTable?
There was a problem hiding this comment.
So should I use SymbolTable as vocab?
src/rnnlm/arpa-sampling.h
Outdated
| HistWeightsType hists_weights_; | ||
|
|
||
| // The given N Histories | ||
| std::vector<HistType> histories_; |
There was a problem hiding this comment.
you should NOT store histories_ in this class.
There was a problem hiding this comment.
This class should only store information about the ngram model (read from the arpa file). Histories should just be a parameter you pass in order to get the prob-distributions.
src/rnnlm/arpa-sampling.cc
Outdated
| void ArpaSampling::ComputeWeightedPdf(std::vector<std::pair<int32, BaseFloat> >* pdf_w) { | ||
| BaseFloat prob = 0; | ||
| (*pdf_w).clear(); | ||
| (*pdf_w).resize(num_words_); // if do not do this, (*pdf_w)[word] += prob will get seg fault |
There was a problem hiding this comment.
delete this comment. it's so obvious
src/rnnlm/arpa-sampling.cc
Outdated
| history.push_back(word); | ||
| } | ||
| if (history.size() >= ngram_order_) { | ||
| std::reverse(history.begin(), history.end()); |
There was a problem hiding this comment.
this is an extremely inefficient way of doing things. please make it more efficient.
src/rnnlm/arpa-sampling.cc
Outdated
| KALDI_LOG << "Expected number of " << (i + 1) << "-grams: " << ngram_counts_[i]; | ||
| for (auto it1 = probs_[i].begin(); it1 != probs_[i].end(); ++it1) { | ||
| HistType h(it1->first); | ||
| for (auto it2 = (probs_[i])[h].begin(); it2 != (probs_[i])[h].end(); ++it2) { |
There was a problem hiding this comment.
no need to do (v[i])[j] --- just use v[i][j]
src/rnnlm/arpa-sampling.cc
Outdated
| if (it != probs_[order].end()) { | ||
| auto it2 = probs_[order][history].find(word); | ||
| if (it2 != probs_[order][history].end()) { | ||
| prob = pow(10, (it2->second).first); |
There was a problem hiding this comment.
no need to do (i->second).first -- just do i->second.first
| if (it2 != probs_[order][history].end()) { | ||
| prob = pow(10, (it2->second).first); | ||
| (*pdf)[i].first = word; | ||
| (*pdf)[i].second += prob; |
There was a problem hiding this comment.
i'm very confused why you do += prob
src/rnnlm/arpa-sampling.cc
Outdated
| int32 word_new = history.back(); | ||
| HistType::const_iterator last_new = history.end() - 1; | ||
| HistType h_new(history.begin(), last_new); | ||
| prob = pow(10, GetBackoffWeight(order, word_new, h_new)) * |
There was a problem hiding this comment.
pow(10, a + b) would be better than pow(10, a) * pow(10, b)
src/rnnlm/arpa-sampling.h
Outdated
| static const int kPrime = 7853; | ||
| }; | ||
|
|
||
| // Predefine some symbol values, because any integer is as good as any other. |
src/rnnlm/arpa-sampling.h
Outdated
| typedef std::vector<int32> HistType; | ||
| typedef unordered_map<int32, std::pair<BaseFloat, BaseFloat> > WordToProbsMap; | ||
| typedef unordered_map<HistType, WordToProbsMap, IntVectorHasher> NgramType; | ||
| typedef unordered_map<HistType, BaseFloat, IntVectorHasher> HistWeightsType; |
There was a problem hiding this comment.
OK change this. I will tell you how.
…sts_weights as class members
|
|
||
| // this function computes history weights for given histories | ||
| // the total weights of histories is 1 | ||
| HistWeightsType ArpaSampling::ComputeHistoriesWeights(std::vector<HistType> histories) { |
| } | ||
|
|
||
| // Read histories of integers from a file | ||
| std::vector<HistType> ArpaSampling::ReadHistories(std::istream &is, bool binary) { |
There was a problem hiding this comment.
need to change this to a void function and move the return value into the argument list as a pointer
could do this later
hainan-xv
left a comment
There was a problem hiding this comment.
OK, I will merge this now. Just remember there are a couple of TODOs:
- moving the test code out of the class
- making the test binary not require arguments
- having separate maps for n-gram probs and backoff weights
No description provided.