From ccbb93c262dcd428707042a11bc2b1dafb693487 Mon Sep 17 00:00:00 2001
From: Daniel Povey
Date: Mon, 26 Aug 2019 18:43:31 -0400
Subject: [PATCH] [src,doc] Fix bug in new option of post-to-phone-post;
 skeleton of faq page

---
 src/bin/post-to-phone-post.cc |  17 +-
 src/doc/faq.dox               |  41 ++++++
 src/doc/kws.dox               | 250 ++--------------------------------
 src/doc/mainpage.dox          |   1 +
 4 files changed, 61 insertions(+), 248 deletions(-)
 create mode 100644 src/doc/faq.dox

diff --git a/src/bin/post-to-phone-post.cc b/src/bin/post-to-phone-post.cc
index 7e91fc5c783..871f03a91a1 100644
--- a/src/bin/post-to-phone-post.cc
+++ b/src/bin/post-to-phone-post.cc
@@ -48,7 +48,7 @@ int main(int argc, char *argv[]) {
     po.Register("transition-id-counts", &tacc_rxfilename,
                 "Rxfilename where vector of counts\n"
                 "for transition-ids can be read (would normally come from training data\n"
-                "alignments, e.g. from ali-to-post and then post-to-tacc with --per-pdf=false\n");
+                "alignments, e.g. from ali-to-post and then post-to-tacc with --per-pdf=false)\n");

     po.Read(argc, argv);

@@ -109,15 +109,16 @@ int main(int argc, char *argv[]) {
       BaseFloat denominator = 0.0;
       for (auto p: pdf_to_phones[i])
         denominator += p.second;
-      for (auto q: pdf_to_phones[i]) {
+      for (auto iter = pdf_to_phones[i].begin(); iter != pdf_to_phones[i].end();
+           ++iter) {
         if (denominator != 0.0)
-          q.second /= denominator;
+          iter->second /= denominator;
         else
-          q.second = 1.0 / pdf_to_phones[i].size();
+          iter->second = 1.0 / pdf_to_phones[i].size();
       }
     }

-    // Input is transition-ids
+    // Input is pdf-ids
     for (; !posterior_reader.Done(); posterior_reader.Next()) {
       const kaldi::Posterior &posterior = posterior_reader.Value();
       int32 T = posterior.size();
@@ -125,7 +126,7 @@ int main(int argc, char *argv[]) {
       std::unordered_map<int32, BaseFloat> phone_to_count;
       for (int32 t = 0; t < T; t++) {
         phone_to_count.clear();
-        for (auto p : phone_posterior[t]) {
+        for (auto p : posterior[t]) {
           int32 pdf_id = p.first;
           BaseFloat count = p.second;
           if (pdf_id < 0 || pdf_id >= num_pdfs)
@@ -134,7 +135,8 @@ int main(int argc, char *argv[]) {
         for (auto q: pdf_to_phones[pdf_id]) {
           int32 phone = q.first;
           BaseFloat prob = q.second;
-          phone_to_count[phone] += count * prob;
+          if (prob != 0.0)
+            phone_to_count[phone] += count * prob;
         }
       }
       for (auto p : phone_to_count) {
@@ -154,4 +156,3 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 }
-
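The main change above fixes a classic C++ pitfall: a range-for that iterates by
value mutates copies, not the container, so the normalization in the original
code never touched the map. A minimal standalone sketch of the buggy and fixed
forms (illustrative only, not part of the patch):

  #include <iostream>
  #include <map>

  int main() {
    std::map<int, float> phones = {{1, 2.0f}, {2, 6.0f}};
    float denominator = 8.0f;
    for (auto q : phones)       // q is a *copy* of each key/value pair
      q.second /= denominator;  // modifies the copy only
    std::cout << phones[2] << std::endl;   // prints 6: map unchanged
    for (auto &q : phones)      // reference binding: mutates in place
      q.second /= denominator;
    std::cout << phones[2] << std::endl;   // prints 0.75
    return 0;
  }

The patch uses an explicit iterator (iter->second) instead, which is equivalent
to the reference form shown here.
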
diff --git a/src/doc/faq.dox b/src/doc/faq.dox
new file mode 100644
index 00000000000..f0b08c156b7
--- /dev/null
+++ b/src/doc/faq.dox
@@ -0,0 +1,41 @@
+// doc/faq.dox
+
+
+// Copyright 2019  Daniel Povey
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//  http://www.apache.org/licenses/LICENSE-2.0
+
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+namespace kaldi {
+
+/**
+  \page faq Frequently Asked Questions
+
+  \section faq_intro Introduction
+
+  This page contains the answers to some miscellaneous frequently asked
+  questions from the mailing lists.  This should not be your primary way of
+  finding such answers: the mailing lists and GitHub contain many more
+  discussions, and a web search may be the easiest way to find answers.
+
+  \section why_is_the_sky_blue Why is the sky blue?
+
+  Answer to example question goes here.
+  This is just a template.
+
+*/
+}
diff --git a/src/doc/kws.dox b/src/doc/kws.dox
index 13f7e0dd508..7bfceaf3675 100644
--- a/src/doc/kws.dox
+++ b/src/doc/kws.dox
@@ -1,7 +1,7 @@
-// doc/kws.dox
+// doc/faq.dox


-// Copyright 2013 Johns Hopkins University (author: Guoguo Chen)
+// Copyright 2019  Daniel Povey

 // See ../../COPYING for clarification regarding multiple authors
 //
@@ -21,251 +21,21 @@
 namespace kaldi {

 /**
-  \page kws Keyword Search in Kaldi
+  \page faq Frequently Asked Questions

-  \section kws_intro Introduction
-  This page describes the keyword search module in Kaldi. Our implementation
-  includes the following features:
-
-  - Lattice indexing for fast keyword retrieval.
-  - Proxy keywords to handle the out-of-vocabulary (OOV) problem.
-
-  In the following document we will focus on word-level keyword search for
-  simplicity, but our implementation naturally supports word-level as well
-  as subword-level keyword search -- both our LVCSR module and the KWS
-  module are implemented using weighted finite state transducers (WFSTs), and
-  the algorithm should work as long as the symbol table properly maps
-  words/subwords to integers.
-
-  The rest of this document is organized as follows: in section \ref kws_system
-  "Typical Kaldi KWS system", we describe the basic components of a Kaldi KWS
-  system; in section \ref kws_proxy "Proxy keywords", we explain how we use
-  proxy keywords to handle keywords that are not in the vocabulary; finally,
-  in section \ref kws_scripts "Babel scripts", we walk through the KWS-related
-  scripts we created for the IARPA Babel project.
-
-  \section kws_system Typical Kaldi KWS system
-  An example of a Kaldi KWS system can be found in the paper "Quantifying
-  the Value of Pronunciation Lexicons for Keyword Search in Low Resource
-  Languages", G. Chen, S. Khudanpur, D. Povey, J. Trmal, D. Yarowsky and
-  O. Yilmaz. Generally, a KWS system consists of two parts: an LVCSR module
-  that decodes the search collection and generates the corresponding
-  lattices, and a KWS module that builds an index for the lattices and
-  searches for the keywords in the generated index.
-
-  Our basic LVCSR system is an SGMM + MMI system. We use standard PLP analysis
-  to extract 13-dimensional acoustic features, and follow a typical maximum
-  likelihood acoustic training recipe, beginning with a flat-start
-  initialization of context-independent phonetic HMMs, and ending with speaker
-  adaptive training (SAT) of state-clustered triphone HMMs with GMM output
-  densities. This is followed by the training of a universal background model
-  from speaker-transformed training data, which is then used to train a
-  subspace Gaussian mixture model (SGMM) for the HMM emission probabilities.
-  Finally, all the training speech is decoded using the SGMM system, and
-  boosted maximum mutual information (BMMI) training of the SGMM parameters
-  is performed. More details can be found in egs/babel/s5b/run-1-main.sh.
-
-  We also build additional systems besides the basic SGMM + MMI system: for
-  example, a hybrid deep neural network (DNN) system (details in
-  egs/babel/s5b/run-2a-nnet-gpu.sh) and a bottleneck feature system (details
-  in egs/babel/s5b/run-8a-kaldi-bnf.sh). All those systems decode and
-  generate lattices for the same search collection, which are then sent to
-  the KWS module for indexing and searching. We do system combination on the
-  retrieved results instead of on the lattices.
-
-  Lattices generated by the above LVCSR systems are processed using the
-  lattice indexing technique described in "Lattice indexing for spoken term
-  detection", D. Can and M. Saraclar, IEEE Transactions on Audio, Speech, and
-  Language Processing. The lattices of all the utterances in the search
-  collection are converted from individual weighted finite state transducers
-  (WFSTs) to a single generalized factor transducer structure in which the
-  start-time, end-time and lattice posterior probability of each word token
-  is stored as a 3-dimensional cost. This factor transducer is actually an
-  inverted index of all word sequences seen in the lattices. Given a keyword
-  or phrase, we then create a simple finite state machine that accepts the
-  keyword/phrase, and compose it with the factor transducer to obtain all
-  occurrences of the keyword/phrase in the search collection, along with the
-  utterance ID, start-time, end-time and lattice posterior probability of
-  each occurrence. All these occurrences are sorted according to their
-  posterior probabilities, and a YES/NO decision is assigned to each instance
-  using the method proposed in the paper "Rapid and Accurate Spoken Term
-  Detection".
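To make that search step concrete, here is a minimal OpenFst-style sketch of
the keyword acceptor just described. It assumes plain StdArc word-ids, and the
function name MakeKeywordAcceptor is made up for illustration; the actual Kaldi
index carries start-time, end-time and posterior in a lexicographic semiring,
so treat this as a sketch rather than the real implementation:

\verbatim
#include <fst/fstlib.h>
#include <vector>

// Build a linear acceptor for a keyword given as a sequence of word-ids.
fst::StdVectorFst MakeKeywordAcceptor(const std::vector<int> &word_ids) {
  fst::StdVectorFst kw;
  auto s = kw.AddState();
  kw.SetStart(s);
  for (int w : word_ids) {
    auto next = kw.AddState();
    kw.AddArc(s, fst::StdArc(w, w, fst::TropicalWeight::One(), next));
    s = next;
  }
  kw.SetFinal(s, fst::TropicalWeight::One());
  return kw;
}

// Usage: compose with the (arc-sorted) index to enumerate putative hits:
//   fst::StdVectorFst hits;
//   fst::Compose(MakeKeywordAcceptor(ids), index_fst, &hits);
\endverbatim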
-  \section kws_proxy Proxy keywords
-  Our proxy keyword generation process is described in the paper "Using
-  Proxies for OOV Keywords in the Keyword Search Task", G. Chen, O. Yilmaz,
-  J. Trmal, D. Povey and S. Khudanpur. We originally proposed this method to
-  solve the OOV problem of the word lattices -- if a keyword is not in the
-  vocabulary of the LVCSR system, it will not appear in the search collection
-  lattices, even though the keyword is actually spoken in the search
-  collection. This is a known problem of LVCSR-based keyword search systems,
-  and there are ways to handle it, for example, building a subword system.
-  Our approach is to find acoustically similar in-vocabulary (IV) words for
-  the OOV keyword, and use them as proxy keywords instead of the original OOV
-  keyword. The advantage is that we do not have to build additional subword
-  systems. In an upcoming Interspeech paper "Low-Resource Open Vocabulary
-  Keyword Search Using Point Process Models", C. Liu, A. Jansen, G. Chen,
-  K. Kintzley, J. Trmal and S. Khudanpur, we show that this technique is
-  comparable and complementary to a phonetic search method based on point
-  process models. Proxy keywords are one of the fuzzy search methods, and
-  they should also improve IV keyword performance, although we originally
-  introduced them to handle OOV keywords.
-
-  The general proxy keyword generation process can be formulated as follows:
-  \f[
-  K^\prime = \mathrm{Project} \left(
-  \mathrm{ShortestPath} \left(
-  \mathrm{Prune} \left(
-  \mathrm{Prune} \left(K \circ L_2 \circ E^\prime \right)
-  \circ L_1^{-1} \right) \right) \right)
-  \f]
-  where \f$K\f$ is the original keyword and \f$L_2\f$ is a lexicon that
-  contains the pronunciation of \f$K\f$. If \f$K\f$ is out of vocabulary,
-  this lexicon can be obtained by using G2P tools such as Sequitur.
-  \f$E^\prime\f$ is the edit distance transducer that contains the phone
-  confusions collected from the training set, and \f$L_1\f$ is the original
-  lexicon. \f$K^\prime\f$ is then a WFST that contains several IV words that
-  are acoustically similar to the original keyword \f$K\f$. We plug it into
-  the search pipeline "as if" it were the original keyword.
-  Note that the two pruning stages are essential, especially when you have a
-  very large vocabulary. We also implemented a lazy-composition algorithm
-  that only generates composed states as needed (i.e., it does not generate
-  states that will be pruned away later). This avoids blowing up the memory
-  when composing \f$K \circ L_2 \circ E^\prime\f$ with \f$L_1^{-1}\f$.
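A rough OpenFst rendering of the cascade above, under the simplifying
assumption that all machines are plain StdArc WFSTs and using OpenFst-1.6-style
calls; the function and parameter names here are hypothetical, and the actual
implementation sits behind local/kws_data_prep_proxy.sh:

\verbatim
#include <fst/fstlib.h>

// Hypothetical sketch of
//   K' = Project(ShortestPath(Prune(Prune(K o L2 o E') o L1^-1)))
fst::StdVectorFst MakeProxies(const fst::StdVectorFst &K,
                              fst::StdVectorFst L2,
                              fst::StdVectorFst Eprime,
                              fst::StdVectorFst L1_inv,
                              float phone_beam, float beam, int nbest) {
  using fst::StdArc;
  // Compose() needs one side arc-sorted on the matching labels.
  fst::ArcSort(&L2, fst::ILabelCompare<StdArc>());
  fst::ArcSort(&Eprime, fst::ILabelCompare<StdArc>());
  fst::ArcSort(&L1_inv, fst::ILabelCompare<StdArc>());
  fst::StdVectorFst tmp, tmp2, pruned, words, pruned2, best;
  fst::Compose(K, L2, &tmp);         // keyword -> its phone sequences
  fst::Compose(tmp, Eprime, &tmp2);  // allow confusable phone substitutions
  fst::Prune(tmp2, &pruned, fst::TropicalWeight(phone_beam));  // 1st prune
  fst::Compose(pruned, L1_inv, &words);  // phones -> in-vocabulary words
  fst::Prune(words, &pruned2, fst::TropicalWeight(beam));      // 2nd prune
  fst::ShortestPath(pruned2, &best, nbest);  // keep the n best proxies
  fst::Project(&best, fst::PROJECT_OUTPUT);  // keep the word labels only
  return best;
}
\endverbatim

The eager composition here would blow up on a large vocabulary; as noted above,
the real pipeline uses lazy composition so that states pruned away are never
expanded.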
-  \section kws_scripts Babel scripts
-
-  \subsection kws_scripts_highlevel A high-level look
-  We have set up "push-button" scripts for the IARPA Babel project. If you
-  are working on Babel and want to use our scripts, you can build an
-  SGMM + MMI keyword search system with the following steps (assuming you
-  are in the working directory egs/babel/s5b/):
-  - Install F4DE and put it in your path.sh
-  - Modify your cmd.sh so that it can run on your cluster
-  - Link one of the config files in conf/languages to ./lang.conf, e.g.,
-    "ln -s conf/languages/105-turkish-limitedLP.official.conf lang.conf"
-  - Modify lang.conf to point to your files instead of the ones on the JHU
-    cluster
-  - Run run-1-main.sh, which builds the LVCSR system
-  - Run run-2-segmentation.sh, which generates the segmentation for the eval
-    data
-  - Run run-4-anydecode.sh, which decodes the eval data, makes the index and
-    searches for the keywords
-
-  Similarly, you can build DNN systems, BNF systems, semi-supervised systems,
-  etc. The KWS steps happen in run-4-anydecode.sh. We take a detailed look
-  below at how to do keyword search, in case you want to do keyword search
-  on some other resources. We assume that you have decoded your search
-  collection and generated the corresponding lattices.
-
-  \subsection kws_scripts_dataprep Prepare KWS data
-  Typically, we generate KWS data directories under the search collection
-  data directory. For example, if you have a search collection called
-  dev10h.uem, you will have a data directory for it called data/dev10h.uem/.
-  We create KWS data directories under this directory, e.g.,
-  data/dev10h.uem/kws/. Before creating the KWS data directories, you have
-  to get three files ready by hand: an ecf file that contains the search
-  collection information, a kwlist file that lists all the keywords, and an
-  rttm file for scoring. Sometimes you may have to prepare those files
-  yourself; for example, you can generate the rttm file by force-aligning
-  the search collection with a trained model. Below we show the format of
-  those files.
-
-  Example ECF file:
-  \verbatim
-  <ecf source_signal_duration="..." language="" version="">
-    <excerpt audio_filename="YOUR_AUDIO_FILENAME" channel="1" tbeg="0.000" dur="..." source_type="splitcts"/>
-  </ecf>
-  \endverbatim
-
-  Example KWLIST file:
-  \verbatim
-  <kwlist ecf_filename="..." language="tamil" encoding="UTF-8" compareNormalize="" version="">
-    <kw kwid="KW-0001">
-      <kwtext>செய்றத</kwtext>
-    </kw>
-    <kw kwid="KW-0002">
-      <kwtext>சொல்லுவியா</kwtext>
-    </kw>
-  </kwlist>
-  \endverbatim
-
-  Example RTTM file:
-  \verbatim
-  SPEAKER YOUR_AUDIO_FILENAME 1 5.87 0.370 <NA> <NA> spkr1 <NA>
-  LEXEME YOUR_AUDIO_FILENAME 1 5.87 0.370 ஹலோ lex spkr1 0.5
-  SPEAKER YOUR_AUDIO_FILENAME 1 8.78 2.380 <NA> <NA> spkr1 <NA>
-  LEXEME YOUR_AUDIO_FILENAME 1 8.78 0.300 உம்ம் lex spkr1 0.5
-  LEXEME YOUR_AUDIO_FILENAME 1 9.08 0.480 அதான் lex spkr1 0.5
-  LEXEME YOUR_AUDIO_FILENAME 1 9.56 0.510 சரியான lex spkr1 0.5
-  LEXEME YOUR_AUDIO_FILENAME 1 10.07 0.560 மெசேஜ்டா lex spkr1 0.5
-  LEXEME YOUR_AUDIO_FILENAME 1 10.63 0.350 சான்ஸே lex spkr1 0.5
-  LEXEME YOUR_AUDIO_FILENAME 1 10.98 0.180 இல்லயே lex spkr1 0.5
-  \endverbatim
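For clarity about the RTTM columns above (type, file, channel, begin time,
duration, token, subtype, speaker, confidence), here is a hypothetical parser
for the LEXEME rows; it is illustrative only and not part of Kaldi, and the
struct and function names are made up:

\verbatim
#include <sstream>
#include <string>

// Parses one LEXEME row of an RTTM file into its fields.
struct RttmLexeme {
  std::string file, token, subtype, speaker;
  int channel;
  double tbeg, dur, conf;
};

bool ParseLexemeLine(const std::string &line, RttmLexeme *out) {
  std::istringstream is(line);
  std::string type;
  is >> type;
  if (type != "LEXEME") return false;  // skips e.g. the SPEAKER rows
  return static_cast<bool>(is >> out->file >> out->channel >> out->tbeg
                           >> out->dur >> out->token >> out->subtype
                           >> out->speaker >> out->conf);
}
\endverbatim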
-  With the above three files ready, you can start preparing the KWS data
-  directory. If you just want to do a basic keyword search, running the
-  following should be enough:
-  \verbatim
-  local/kws_setup.sh \
-    --case_insensitive $case_insensitive \
-    --rttm-file $my_rttm_file \
-    $my_ecf_file $my_kwlist_file data/lang $dataset_dir
-  \endverbatim
+  \section kws_intro Introduction

-  If you want to do fuzzy search for your OOV keywords, you can run the
-  following few commands, which first collect the phone confusions, train a
-  G2P model, and then create the KWS data directory:
-  \verbatim
-  # Generate the confusion matrix.
-  # NB: this has to be done only once, as it is training-corpus dependent,
-  # not search-collection dependent.
-  if [ ! -f exp/conf_matrix/.done ] ; then
-    local/generate_confusion_matrix.sh --cmd "$decode_cmd" --nj $my_nj \
-      exp/sgmm5/graph exp/sgmm5 exp/sgmm5_ali exp/sgmm5_denlats exp/conf_matrix
-    touch exp/conf_matrix/.done
-  fi
-  confusion=exp/conf_matrix/confusions.txt
+  This page contains the answers to some miscellaneous frequently asked
+  questions from the mailing lists. This should not be your primary way of
+  finding such answers: the mailing lists and GitHub contain many more
+  discussions, and a web search may be the easiest way to find answers.

-  if [ ! -f exp/g2p/.done ] ; then
-    local/train_g2p.sh data/local exp/g2p
-    touch exp/g2p/.done
-  fi
-  local/apply_g2p.sh --nj $my_nj --cmd "$decode_cmd" \
-    --var-counts $g2p_nbest --var-mass $g2p_mass \
-    $kwsdatadir/oov.txt exp/g2p $kwsdatadir/g2p
-  L2_lex=$kwsdatadir/g2p/lexicon.lex
-  L1_lex=data/local/lexiconp.txt
-  local/kws_data_prep_proxy.sh \
-    --cmd "$decode_cmd" --nj $my_nj \
-    --case-insensitive true \
-    --confusion-matrix $confusion \
-    --phone-cutoff $phone_cutoff \
-    --pron-probs true --beam $beam --nbest $nbest \
-    --phone-beam $phone_beam --phone-nbest $phone_nbest \
-    data/lang $data_dir $L1_lex $L2_lex $kwsdatadir
-  \endverbatim
+  \section faq_example Example question

-  \subsection kws_scripts_index_and_search Indexing and searching
-  At this stage we assume you have decoded your search collection and
-  generated the corresponding lattices. Running the following script will
-  take care of indexing and searching:
-  \verbatim
-  local/kws_search.sh --cmd "$cmd" \
-    --max-states ${max_states} --min-lmwt ${min_lmwt} \
-    --max-lmwt ${max_lmwt} --skip-scoring $skip_scoring \
-    --indices-dir $decode_dir/kws_indices $lang_dir $data_dir $decode_dir
-  \endverbatim
+  Answer to example question goes here.

-  If your KWS data directory has an extra ID, e.g., oov (this is useful when
-  you have several KWS setups; in this case, your directory will look
-  something like data/dev10h.uem/kws_oov), you have to run it with the
-  --extraid option:
-  \verbatim
-  local/kws_search.sh --cmd "$cmd" --extraid $extraid \
-    --max-states ${max_states} --min-lmwt ${min_lmwt} \
-    --max-lmwt ${max_lmwt} --skip-scoring $skip_scoring \
-    --indices-dir $decode_dir/kws_indices $lang_dir $data_dir $decode_dir
-  \endverbatim

 */
 }
diff --git a/src/doc/mainpage.dox b/src/doc/mainpage.dox
index 88fefbd8e02..1f09ba5e019 100644
--- a/src/doc/mainpage.dox
+++ b/src/doc/mainpage.dox
@@ -46,6 +46,7 @@
    - \subpage tutorial
    - \subpage kaldi_for_dummies
    - \subpage examples
+   - \subpage faq
    - \subpage glossary
    - \subpage data_prep
    - \subpage build_setup