[src,doc] Fix bug in new option of post-to-phone-post; skeleton of faq page #3540
Merged
doc/faq.dox (new file)
@@ -0,0 +1,41 @@
// doc/kws.dox

// Copyright 2013 Johns Hopkins University (author: Guoguo Chen)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

//  http://www.apache.org/licenses/LICENSE-2.0

// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

namespace kaldi {

/**
\page faq Frequently Asked Questions

\section faq_intro Introduction

This page contains the answers to some miscellaneous frequently asked
questions from the mailing lists. This should not be your primary way of
finding such answers: the mailing lists and github contain many more
discussions, and a web search may be the easiest way to find answers.

\section why_is_the_sky_blue Why is the sky blue?

Answer to example question goes here.
This is just a template.

*/
}
doc/kws.dox
@@ -1,7 +1,7 @@
// doc/kws.dox
Contributor: Something doesn't seem right here. The body of the KWS page has been replaced by the body of the FAQ skeleton.
// doc/faq.dox

// Copyright 2013 Johns Hopkins University (author: Guoguo Chen)
// Copyright 2019 Daniel Povey

// See ../../COPYING for clarification regarding multiple authors
//
@@ -21,251 +21,21 @@ | |
| namespace kaldi { | ||
|
|
||
| /** | ||
| \page kws Keyword Search in Kaldi | ||
| \page faq Frequently Asked Questions | ||
|
|
||
| \section kws_intro Introduction | ||
| This page describes the keyword search module in Kaldi. Our implementation | ||
| includes the following features: | ||
|
|
||
| - Lattice indexing for fast keyword retrieval. | ||
| - Proxy keywords to handle out-of-vocabulary (OOV) problem. | ||
|
|
||
| In the following document, we will focus on word level keyword search for | ||
| simplicity purpose, but our implementation naturally supports word level as | ||
| well as subword level keyword search -- both our LVCSR module and the KWS | ||
| module are implemented using weighted finite state transducer (WFST), and the | ||
| algorithm should work as long as the symbol table properly maps words/subwords | ||
| to integers. | ||
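To illustrate that last point, here is a minimal sketch (not part of the PR; it assumes only stock OpenFst, and the symbols are made up) showing a word-level and a subword-level symbol table. The downstream WFST machinery does not care which one is used, as long as each label maps consistently to an integer.

\verbatim
// Hypothetical illustration: word-level vs. subword-level symbol tables.
// Only the label <-> integer mapping differs; the WFST algorithms are the same.
#include <fst/symbol-table.h>
#include <iostream>

int main() {
  fst::SymbolTable words("words");
  words.AddSymbol("<eps>", 0);   // epsilon is conventionally label 0
  words.AddSymbol("keyword");    // automatically assigned integer ids
  words.AddSymbol("search");

  fst::SymbolTable subwords("subwords");
  subwords.AddSymbol("<eps>", 0);
  subwords.AddSymbol("key_");    // example subword units (made up)
  subwords.AddSymbol("_word");

  std::cout << "keyword -> " << words.Find("keyword") << "\n"
            << "key_    -> " << subwords.Find("key_") << "\n";
  return 0;
}
\endverbatim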
The rest of this document is organized as follows: in section \ref kws_system
"Typical Kaldi KWS system", we describe the basic components of a Kaldi KWS
system; in section \ref kws_proxy "Proxy keywords", we explain how we use
proxy keywords to handle keywords that are not in the vocabulary; finally,
in section \ref kws_scripts "Babel scripts", we walk through the KWS-related
scripts we created for the IARPA Babel project.

\section kws_system Typical Kaldi KWS system
An example of a Kaldi KWS system can be found in the paper <a href=
http://www.clsp.jhu.edu/~guoguo/papers/icassp2013_lexicon_value.pdf>"Quantifying
the Value of Pronunciation Lexicons for Keyword Search in Low Resource
Languages", G. Chen, S. Khudanpur, D. Povey, J. Trmal, D. Yarowsky and
O. Yilmaz</a>. Generally, a KWS system consists of two parts: an LVCSR module
that decodes the search collection and generates the corresponding lattices,
and a KWS module that builds an index of the lattices and searches for the
keywords in the generated index.

Our basic LVCSR system is an SGMM + MMI system. We use standard PLP analysis to
extract 13-dimensional acoustic features, and follow a typical maximum
likelihood acoustic training recipe, beginning with a flat-start
initialization of context-independent phonetic HMMs, and ending with speaker
adaptive training (SAT) of state-clustered triphone HMMs with GMM output
densities. This is followed by the training of a universal background model
from speaker-transformed training data, which is then used to train a subspace
Gaussian mixture model (SGMM) for the HMM emission probabilities. Finally, all
the training speech is decoded using the SGMM system, and boosted maximum
mutual information (BMMI) training of the SGMM parameters is performed. More
details can be found in egs/babel/s5b/run-1-main.sh.

We also build additional systems besides the basic SGMM + MMI system: for
example, a hybrid deep neural network (DNN) system (see
egs/babel/s5b/run-2a-nnet-gpu.sh), a bottleneck feature system (see
egs/babel/s5b/run-8a-kaldi-bnf.sh), etc. All of those systems decode and
generate lattices for the same search collection, which are then sent to the
KWS module for indexing and searching. We do system combination on the
retrieved results instead of on the lattices.

Lattices generated by the above LVCSR systems are processed using the lattice
indexing technique described in <a href=
https://wiki.inf.ed.ac.uk/twiki/pub/CSTR/ListenSemester2201314/taslp_2011.pdf>
"Lattice indexing for spoken term detection", D. Can, M. Saraclar, IEEE
Transactions on Audio, Speech, and Language Processing</a>. The lattices of all
the utterances in the search collection are converted from individual weighted
finite-state transducers (WFSTs) into a single generalized factor transducer
structure, in which the start time, end time and lattice posterior probability
of each word token are stored as a 3-dimensional cost. This factor transducer
is effectively an inverted index of all word sequences seen in the lattices.
Given a keyword or phrase, we then create a simple finite state machine that
accepts the keyword/phrase and compose it with the factor transducer to obtain
all occurrences of the keyword/phrase in the search collection, along with the
utterance ID, start time, end time and lattice posterior probability of each
occurrence. All of those occurrences are sorted according to their posterior
probabilities, and a YES/NO decision is assigned to each instance using the
method proposed in the paper "Rapid and Accurate Spoken Term Detection".
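To make the retrieval step concrete, the following is a rough OpenFst-level sketch of the compose-and-search idea. It is not code from the PR or from Kaldi's KWS tools: the index file name and the integer labels are made up, and the plain tropical semiring is used, whereas the real factor transducer carries start-time, end-time and posterior information in specialized weights.

\verbatim
// Hypothetical sketch: compose a keyword acceptor with an inverted-index FST.
#include <fst/fstlib.h>
#include <memory>

int main() {
  // Load a previously built inverted-index FST (hypothetical file name).
  std::unique_ptr<fst::StdVectorFst> index(
      fst::StdVectorFst::Read("index.fst"));
  if (!index) return 1;

  // Build a linear acceptor for a two-word keyword; the integer labels are
  // assumed to come from the same symbol table as the index.
  fst::StdVectorFst keyword;
  for (int i = 0; i < 3; ++i) keyword.AddState();
  keyword.SetStart(0);
  keyword.SetFinal(2, fst::TropicalWeight::One());
  keyword.AddArc(0, fst::StdArc(42, 42, fst::TropicalWeight::One(), 1));
  keyword.AddArc(1, fst::StdArc(43, 43, fst::TropicalWeight::One(), 2));
  fst::ArcSort(&keyword, fst::OLabelCompare<fst::StdArc>());

  // Composition yields every occurrence of the keyword in the collection.
  fst::StdVectorFst hits;
  fst::Compose(keyword, *index, &hits);

  // Keep only the best-scoring occurrence, just for illustration.
  fst::StdVectorFst best;
  fst::ShortestPath(hits, &best);
  best.Write("best_hit.fst");
  return 0;
}
\endverbatim

In the actual pipeline this step is driven by the KWS search scripts shown later on this page.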
\section kws_proxy Proxy keywords
Our proxy keyword generation process is described in the paper <a href=
http://www.clsp.jhu.edu/~guoguo/papers/asru2013_proxy_keyword.pdf>"Using Proxies
for OOV Keywords in the Keyword Search Task", G. Chen, O. Yilmaz, J. Trmal,
D. Povey, S. Khudanpur</a>. We originally proposed this method to solve the
OOV problem of the word lattices -- if a keyword is not in the vocabulary of
the LVCSR system, it will not appear in the search collection lattices, even
though the keyword is actually spoken in the search collection. This is a
known problem of LVCSR-based keyword search systems, and there are ways to
handle it, for example by building a subword system. Our approach is to find
acoustically similar in-vocabulary (IV) words for the OOV keyword, and use
them as proxy keywords instead of the original OOV keyword. The advantage is
that we do not have to build additional subword systems. In an upcoming
Interspeech paper, "Low-Resource Open Vocabulary Keyword Search Using Point
Process Models", C. Liu, A. Jansen, G. Chen, K. Kintzley, J. Trmal,
S. Khudanpur, we show that this technique is comparable and complementary to a
phonetic search method based on point process models. Proxy keywords are one
of the fuzzy search methods, and they should also improve IV keyword
performance, although we originally introduced them to handle OOV keywords.

The general proxy keyword generation process can be formulated as follows:
\f[
K^\prime = \mathrm{Project} \left(
\mathrm{ShortestPath} \left(
\mathrm{Prune} \left(
\mathrm{Prune} \left(K \circ L_2 \circ E^\prime \right)
\circ L_1^{-1} \right) \right) \right)
\f]

where \f$K\f$ is the original keyword, and \f$L_2\f$ is a lexicon that contains
the pronunciation of \f$K\f$; if \f$K\f$ is out of vocabulary, this lexicon
can be obtained using G2P tools such as Sequitur. \f$E^\prime\f$ is the
edit distance transducer that contains the phone confusions collected from the
training set, and \f$L_1\f$ is the original lexicon. \f$K^\prime\f$ is then a
WFST that contains several IV words that are acoustically similar to the
original keyword \f$K\f$. We plug it into the search pipeline "as if" it were
the original keyword.

Note that the two pruning stages are essential, especially when you have a
very large vocabulary. We also implemented a lazy composition algorithm that
only generates composed states as needed (i.e., it does not generate states
that will be pruned away later). This avoids blowing up memory when composing
\f$K \circ L_2 \circ E^\prime\f$ with \f$L_1^{-1}\f$.
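The formula above maps almost one-to-one onto standard OpenFst operations. Below is a rough sketch under the assumption that \f$K\f$, \f$L_2\f$, \f$E^\prime\f$ and \f$L_1\f$ have already been built and written to disk; the file names, beam values and n-best size are made up, and eager composition is used for brevity rather than the lazy composition described above.

\verbatim
// Rough sketch of K' = Project(ShortestPath(Prune(Prune(K o L2 o E') o L1^-1)))
// using eager OpenFst operations; file names and beams are illustrative only.
#include <fst/fstlib.h>
#include <memory>

int main() {
  using fst::StdVectorFst;
  std::unique_ptr<StdVectorFst> K(StdVectorFst::Read("K.fst"));
  std::unique_ptr<StdVectorFst> L2(StdVectorFst::Read("L2.fst"));
  std::unique_ptr<StdVectorFst> E(StdVectorFst::Read("Eprime.fst"));
  std::unique_ptr<StdVectorFst> L1(StdVectorFst::Read("L1.fst"));
  if (!K || !L2 || !E || !L1) return 1;

  // L1^{-1}: swap input/output labels of the original lexicon.
  fst::Invert(L1.get());

  // Arc-sort so the compositions below are legal.
  fst::ArcSort(K.get(), fst::OLabelCompare<fst::StdArc>());
  fst::ArcSort(L2.get(), fst::ILabelCompare<fst::StdArc>());
  fst::ArcSort(E.get(), fst::ILabelCompare<fst::StdArc>());
  fst::ArcSort(L1.get(), fst::ILabelCompare<fst::StdArc>());

  // K o L2 o E', followed by the first pruning stage.
  StdVectorFst KL2, KL2E;
  fst::Compose(*K, *L2, &KL2);
  fst::Compose(KL2, *E, &KL2E);
  fst::Prune(&KL2E, fst::TropicalWeight(4.0));    // phone-level beam (assumed)

  // Compose with L1^{-1} and prune again.
  fst::ArcSort(&KL2E, fst::OLabelCompare<fst::StdArc>());
  StdVectorFst proxies;
  fst::Compose(KL2E, *L1, &proxies);
  fst::Prune(&proxies, fst::TropicalWeight(5.0)); // word-level beam (assumed)

  // Keep the n best proxies and project onto the (word) output labels.
  StdVectorFst Kprime;
  fst::ShortestPath(proxies, &Kprime, /*nshortest=*/100);
  fst::Project(&Kprime, fst::PROJECT_OUTPUT);  // newer OpenFst: ProjectType::OUTPUT
  Kprime.Write("Kprime.fst");
  return 0;
}
\endverbatim

With a large vocabulary the intermediate machines in this eager version can grow very large, which is exactly why the text above stresses the two pruning stages and the lazy composition.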
\section kws_scripts Babel scripts

\subsection kws_scripts_highlevel A high-level look
We have set up "push-button" scripts for the IARPA Babel project. If you are
working on Babel and want to use our scripts, you can build an SGMM + MMI
keyword search system with the following steps (assuming you are in the
working directory egs/babel/s5b/):
- Install F4DE and put it in your path.sh
- Modify your cmd.sh so that it can run on your cluster
- Link one of the config files in conf/languages to ./lang.conf, e.g.,
  "ln -s conf/languages/105-turkish-limitedLP.official.conf lang.conf"
- Modify lang.conf to point to your files instead of the ones on the JHU cluster
- Run run-1-main.sh, which builds the LVCSR system
- Run run-2-segmentation.sh, which generates the segmentation for the eval data
- Run run-4-anydecode.sh, which decodes the eval data, builds the index and
  searches for the keywords

Similarly, you can build DNN systems, BNF systems, semi-supervised systems,
etc. The KWS work happens in run-4-anydecode.sh. Below we take a detailed look
at how to do keyword search, in case you want to do keyword search for some
other resources. We assume that you have decoded your search collection and
generated the corresponding lattices.

\subsection kws_scripts_dataprep Prepare KWS data
Typically, we generate KWS data directories under the search collection data
directory. For example, if you have a search collection called dev10h.uem, you
will have a data directory for it called data/dev10h.uem/. We create KWS data
directories under this directory, e.g., data/dev10h.uem/kws/. Before creating
the KWS data directories, you have to get three files ready by hand: an ecf
file that contains the search collection information, a kwlist file that lists
all the keywords, and an rttm file for scoring. Sometimes you may have to
prepare those files yourself; for example, you can generate the rttm file by
force-aligning the search collection with a trained model. Below we show the
format of those files.

Example ECF file:
\verbatim
<ecf source_signal_duration="483.825" language="" version="Excluded noscore regions">
<excerpt audio_filename="YOUR_AUDIO_FILENAME" channel="1" tbeg="0.000" dur="483.825" source_type="splitcts"/>
</ecf>
\endverbatim

Example KWLIST file:
\verbatim
<kwlist ecf_filename="ecf.xml" language="tamil" encoding="UTF-8" compareNormalize="" version="Example keywords">
<kw kwid="KW204-00001">
<kwtext>செய்றத</kwtext>
</kw>
<kw kwid="KW204-00002">
<kwtext>சொல்லுவியா</kwtext>
</kw>
</kwlist>
\endverbatim

Example RTTM file:
\verbatim
SPEAKER YOUR_AUDIO_FILENAME 1 5.87 0.370 <NA> <NA> spkr1 <NA>
LEXEME YOUR_AUDIO_FILENAME 1 5.87 0.370 ஹலோ lex spkr1 0.5
SPEAKER YOUR_AUDIO_FILENAME 1 8.78 2.380 <NA> <NA> spkr1 <NA>
LEXEME YOUR_AUDIO_FILENAME 1 8.78 0.300 உம்ம் lex spkr1 0.5
LEXEME YOUR_AUDIO_FILENAME 1 9.08 0.480 அதான் lex spkr1 0.5
LEXEME YOUR_AUDIO_FILENAME 1 9.56 0.510 சரியான lex spkr1 0.5
LEXEME YOUR_AUDIO_FILENAME 1 10.07 0.560 மெசேஜ்டா lex spkr1 0.5
LEXEME YOUR_AUDIO_FILENAME 1 10.63 0.350 சான்ஸே lex spkr1 0.5
LEXEME YOUR_AUDIO_FILENAME 1 10.98 0.180 இல்லயே lex spkr1 0.5
\endverbatim

With the above three files ready, you can start preparing the KWS data
directory. If you just want to do a basic keyword search, running the
following should be enough:
\verbatim
local/kws_setup.sh \
  --case_insensitive $case_insensitive \
  --rttm-file $my_rttm_file \
  $my_ecf_file $my_kwlist_file data/lang $dataset_dir
\endverbatim
\section kws_intro Introduction

If you want to do fuzzy search for your OOV keywords, you can run the
following few commands, which first collect the phone confusions, then train
a G2P model, and finally create the KWS data directory:
\verbatim
# Generate the confusion matrix.
# NB: this has to be done only once, as it depends on the training corpora
# rather than on the search collection.
if [ ! -f exp/conf_matrix/.done ] ; then
  local/generate_confusion_matrix.sh --cmd "$decode_cmd" --nj $my_nj \
    exp/sgmm5/graph exp/sgmm5 exp/sgmm5_ali exp/sgmm5_denlats exp/conf_matrix
  touch exp/conf_matrix/.done
fi
confusion=exp/conf_matrix/confusions.txt
This page contains the answers to some miscellaneous frequently asked questions
from the mailing lists. This should not be your primary way of finding such
answers: the mailing lists and github contain many more discussions, and a
web search may be the easiest way to find answers.

if [ ! -f exp/g2p/.done ] ; then
  local/train_g2p.sh data/local exp/g2p
  touch exp/g2p/.done
fi
local/apply_g2p.sh --nj $my_nj --cmd "$decode_cmd" \
  --var-counts $g2p_nbest --var-mass $g2p_mass \
  $kwsdatadir/oov.txt exp/g2p $kwsdatadir/g2p
L2_lex=$kwsdatadir/g2p/lexicon.lex

L1_lex=data/local/lexiconp.txt
local/kws_data_prep_proxy.sh \
  --cmd "$decode_cmd" --nj $my_nj \
  --case-insensitive true \
  --confusion-matrix $confusion \
  --phone-cutoff $phone_cutoff \
  --pron-probs true --beam $beam --nbest $nbest \
  --phone-beam $phone_beam --phone-nbest $phone_nbest \
  data/lang $data_dir $L1_lex $L2_lex $kwsdatadir
\endverbatim
\section Example question?

\subsection kws_scripts_index_and_search Indexing and searching
At this stage we assume you have decoded your search collection and generated
the corresponding lattices. Running the following script will take care of
indexing and searching:
\verbatim
local/kws_search.sh --cmd "$cmd" \
  --max-states ${max_states} --min-lmwt ${min_lmwt} \
  --max-lmwt ${max_lmwt} --skip-scoring $skip_scoring \
  --indices-dir $decode_dir/kws_indices $lang_dir $data_dir $decode_dir
\endverbatim
Answer to example question

If your KWS data directory has an extra ID, e.g., oov (this is useful when you
have different KWS setups; in this case your directory will look something
like data/dev10h.uem/kws_oov), you have to run it with the --extraid option:
\verbatim
local/kws_search.sh --cmd "$cmd" --extraid $extraid \
  --max-states ${max_states} --min-lmwt ${min_lmwt} \
  --max-lmwt ${max_lmwt} --skip-scoring $skip_scoring \
  --indices-dir $decode_dir/kws_indices $lang_dir $data_dir $decode_dir
\endverbatim
*/
}