Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# This script demonstrates a lexicon learning recipe, which aims to improve
# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon.sh
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_bayesian.sh
# for explanation of the options.
#
# Copyright 2016 Xiaohui Zhang
Expand Down Expand Up @@ -78,7 +78,7 @@ fi

# Learn a lexicon based on the acoustic training data and the reference lexicon.
if [ $stage -le 1 ]; then
steps/dict/learn_lexicon.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
steps/dict/learn_lexicon_bayesian.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
--min-prob $min_prob --variants-prob-mass $variants_prob_mass \
--variants-prob-mass-ref $variants_prob_mass_ref \
--prior-counts-tot $prior_counts_tot --prior-mean $prior_mean \
Expand Down
133 changes: 133 additions & 0 deletions egs/tedlium/s5_r2/local/run_learn_lex_greedy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/bin/bash
#
# This script demonstrates a lexicon learning recipe, which aims to improve
# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_greedy.sh
# for explanation of the options.
#
# Copyright 2018 Xiaohui Zhang
# Apache 2.0

. ./cmd.sh
. ./path.sh

oov_symbol="<unk>"
# The user may have a Phonetisaurus-trained English g2p model ready.
g2p_mdl_dir=
# The dir which contains the reference lexicon (most probably hand-derived)
# we want to expand/improve, and nonsilence_phones.txt, etc. which we need
# for building new dict dirs.
ref_dict=data/local/dict
# Acoustic training data we use to get alternative
# pronunciations and collect acoustic evidence.
data=data/train
# The cut-off parameter used to select pronunciation candidates from phone
# decoding. We remove pronunciations with probabilities less than this value
# after normalizing the probs s.t. the max-prob is 1.0 for each word.
min_prob=0.1
# Refer to steps/dict/select_prons_greedy.sh for the detailed meaning of
# alpha, beta and delta. Basically, the three dimensions of alpha
# and beta correspond to three pronunciation sources: phonetic-
# decoding, G2P and the reference lexicon, and the larger a value is,
# the more aggressively we'll prune pronunciations from that source.
# The valid range of each dim. is [0, 1] for alpha (0 means we never
# prune prons from that source) and [0, 100] for beta.
alpha="0.04,0.02,0"
beta="30,5,0"
# Floor value of the pronunciation posterior statistics.
delta=0.00000001
# This parameter determines how many pronunciations we keep for each word
# after the first pass pruning. See steps/dict/internal/prune_pron_candidates.py
# for details.
vcr=16

# Intermediate outputs of the lexicon learning stage will be put into dir.
dir=exp/tri3_lex_greedy_work
nj=35
decode_nj=30
stage=0
lexlearn_stage=0
affix="learned_greedy"

. utils/parse_options.sh # accept options

# The reference vocab is the list of words for which we already have
# hand-derived pronunciations.
ref_vocab=data/local/vocab.txt
cat $ref_dict/lexicon.txt | awk '{print $1}' | sort | uniq > $ref_vocab || exit 1;

# Get a G2P generated lexicon for oov words (w.r.t. the reference lexicon)
# in acoustic training data.
if [ $stage -le 0 ]; then
  # Quoted: an empty value must not make the test vanish into '[ -z ]'.
  if [ -z "$g2p_mdl_dir" ]; then
    g2p_mdl_dir=exp/g2p_phonetisaurus
    steps/dict/train_g2p_phonetisaurus.sh $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1;
  fi
  # All words seen in the training text (fields 2..NF of each line).
  awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \
    $data/text | sort -u > $data/train_vocab.txt || exit 1;
  # Training words absent from the reference vocab.
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \
    $data/train_vocab.txt | sort > $data/oov_train.txt || exit 1;
  steps/dict/apply_g2p_phonetisaurus.sh --nbest 5 $data/train_vocab.txt $g2p_mdl_dir \
    exp/g2p_phonetisaurus/lex_train || exit 1;
fi

# Learn a lexicon based on the acoustic training data and the reference lexicon.
if [ $stage -le 1 ]; then
  steps/dict/learn_lexicon_greedy.sh --lexiconp-g2p "exp/g2p_phonetisaurus/lex_train/lexicon.lex" \
    --alpha $alpha --beta $beta --delta $delta \
    --min-prob $min_prob --cmd "$train_cmd" \
    --variant-counts-ratio $vcr \
    --stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl false \
    $ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_${affix}_nosp \
    $dir || exit 1;
fi

# Add pronunciation probs to the learned lexicon.
if [ $stage -le 2 ]; then
  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
    data/local/dict_${affix}_nosp $oov_symbol data/local/lang_${affix}_nosp data/lang_${affix}_nosp || exit 1;

  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
    $data data/lang_${affix}_nosp exp/tri2 exp/tri2_ali_${affix}_nosp || exit 1;

  steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_${affix}_nosp exp/tri2_ali_${affix}_nosp || exit 1;

  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_${affix}_nosp exp/tri2_ali_${affix}_nosp/pron_counts_nowb.txt \
    exp/tri2_ali_${affix}_nosp/sil_counts_nowb.txt \
    exp/tri2_ali_${affix}_nosp/pron_bigram_counts_nowb.txt data/local/dict_${affix} || exit 1;

  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
    data/local/dict_${affix} $oov_symbol data/local/lang_${affix} data/lang_${affix} || exit 1;
fi

# Re-decode
if [ $stage -le 3 ]; then
  ! cmp data/lang_nosp/words.txt data/lang_${affix}/words.txt &&\
    echo "$0: The vocab of the affix lexicon and the reference vocab may be incompatible."
  cp data/lang_nosp/G.fst data/lang_${affix}/
  utils/mkgraph.sh data/lang_${affix} exp/tri3 exp/tri3/graph_${affix} || exit 1;

  for dset in dev test; do
    ( steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
        exp/tri3/graph_${affix} data/${dset} exp/tri3/decode_${affix}_${dset} || exit 1;
    ) &
  done
fi

# RESULTS:
# Baseline:
# %WER 18.7 | 507 17783 | 83.9 11.4 4.7 2.6 18.7 92.3 | -0.006 | exp/tri3/decode_dev/score_17_0.0/ctm.filt.filt.sys
# %WER 17.6 | 1155 27500 | 84.7 11.6 3.7 2.4 17.6 87.2 | 0.013 | exp/tri3/decode_test/score_15_0.0/ctm.filt.filt.sys

# Re-decoding with the learned lexicon:
# %WER 18.5 | 507 17783 | 84.3 11.2 4.5 2.8 18.5 92.3 | -0.007 | exp/tri3/decode_learned_greedy_dev/score_16_0.0/ctm.filt.filt.sys
# %WER 17.5 | 1155 27500 | 84.9 11.5 3.6 2.4 17.5 87.5 | 0.035 | exp/tri3/decode_learned_greedy_test/score_14_0.0/ctm.filt.filt.sys

# To see the effect to neural-net results, one should re-train NN with the learned lexicon.
# Experiments have shown that, with the new lang dir, one should just re-run NN training
# starting from the supervision generation (steps/align_fmllr_lats.sh) stage, and should
# expect improved overall WERs and word recognition performance on words whose pronunciations
# were changed.

# BUGFIX: the original had 'exit' before 'wait', so the backgrounded decode
# jobs from stage 3 were never waited for. Wait first, then exit cleanly.
wait
exit 0
1 change: 1 addition & 0 deletions egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ if [ $stage -le 16 ]; then
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--trainer.dropout-schedule $dropout_schedule \
--trainer.add-option="--optimization.memory-compression-level=2" \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=10 \
Expand Down
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/dict/apply_lexicon_edits.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
def GetArgs():
parser = argparse.ArgumentParser(description = "Apply an lexicon edits file (output from steps/dict/select_prons_bayesian.py)to an input lexicon"
"to produce a learned lexicon.",
epilog = "See steps/dict/learn_lexicon.sh for example")
epilog = "See steps/dict/learn_lexicon_greedy.sh for example")

parser.add_argument("in_lexicon", metavar='<in-lexicon>', type = str,
help = "Input lexicon. Each line must be <word> <phones>.")
Expand Down
19 changes: 10 additions & 9 deletions egs/wsj/s5/steps/dict/get_pron_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,16 @@
import sys

def GetArgs():
parser = argparse.ArgumentParser(description = "Accumulate statistics from lattice-alignment outputs for lexicon"
"learning. The inputs are a file containing arc level information from lattice-align-words,"
"and a map which maps word-position-dependent phones to word-position-independent phones"
"(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts"
"of pronunciations",
epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|"
" steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\"
" exp/tri3_lex_0.4_work/lats/pron_stats.txt"
"See steps/dict/learn_lexicon.sh for examples in detail.")
parser = argparse.ArgumentParser(
description = "Accumulate statistics from lattice-alignment outputs for lexicon"
"learning. The inputs are a file containing arc level information from lattice-align-words,"
"and a map which maps word-position-dependent phones to word-position-independent phones"
"(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts"
"of pronunciations",
epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|"
" steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\"
" exp/tri3_lex_0.4_work/lats/pron_stats.txt"
"See steps/dict/learn_lexicon_greedy.sh for examples in detail.")

parser.add_argument("arc_info_file", metavar = "<arc-info-file>", type = str,
help = "Input file containing per arc statistics; "
Expand Down
140 changes: 140 additions & 0 deletions egs/wsj/s5/steps/dict/internal/get_subsegments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env python

# Copyright 2018 Xiaohui Zhang
# Apache 2.0.

# we're using python 3.x style print but want it to work in python 2.x,
from __future__ import print_function
import argparse
import sys
import string

def GetArgs():
    """Build the command-line parser, echo the invocation to stderr,
    and return the parsed arguments after validation by CheckArgs()."""
    # Log the exact command line for reproducibility (Kaldi convention).
    print(' '.join(sys.argv), file=sys.stderr)

    arg_parser = argparse.ArgumentParser(
        description = "The purpose of this script is to use a ctm and a vocab file"
        "to extract sub-utterances and a sub-segmentation. Extracted sub-utterances"
        "are all the strings of consecutive in-vocab words from the ctm"
        "surrounded by an out-of-vocab word at each end if present.",
        epilog = "e.g. steps/dict/internal/get_subsegments.py exp/tri3_lex_0.4_work/phonetic_decoding/word.ctm \\"
        "exp/tri3_lex_0.4_work/learn_vocab.txt exp/tri3_lex_0.4_work/resegmentation/subsegments \\"
        "exp/tri3_lex_0.4_work/resegmentation/text"
        "See steps/dict/learn_lexicon_greedy.sh for an example.")

    arg_parser.add_argument("ctm", metavar='<ctm>', type = str,
                            help = "Input ctm file."
                            "each line must be <utt-id> <chanel> <start-time> <duration> <word>")
    arg_parser.add_argument("vocab", metavar='<vocab>', type = str,
                            help = "Vocab file."
                            "each line must be <word>")
    arg_parser.add_argument("subsegment", metavar='<subsegtment>', type = str,
                            help = "Subsegment file. Each line is in format:"
                            "<new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>")
    arg_parser.add_argument("text", metavar='<text>', type = str,
                            help = "Text file. Each line is in format:"
                            " <new-utt> <word1> <word2> ... <wordN>.")

    return CheckArgs(arg_parser.parse_args())
return args

def CheckArgs(args):
    """Open the input/output files named in args and attach the handles
    (ctm_handle, vocab_handle, subsegment_handle, text_handle) to args.

    A filename of "-" means "read from stdin". Note that only one of
    ctm/vocab can usefully be "-" in a single invocation.
    Returns the same args namespace, augmented with the handles.
    """
    if args.ctm == "-":
        args.ctm_handle = sys.stdin
    else:
        args.ctm_handle = open(args.ctm)

    # BUGFIX: was 'args.vocab is not ""' (identity comparison with a literal,
    # only correct by accident of string interning) and mapped "-" to
    # sys.stdout — a write-only handle that ReadVocab() later tries to
    # .readlines() from. "-" must mean stdin for an *input* file.
    if args.vocab != '':
        if args.vocab == "-":
            args.vocab_handle = sys.stdin
        else:
            args.vocab_handle = open(args.vocab)

    # Outputs are always regular files opened for writing.
    args.subsegment_handle = open(args.subsegment, 'w')
    args.text_handle = open(args.text, 'w')

    return args

def GetSubsegments(args, vocab):
    """Scan the ctm (args.ctm_handle) line by line and cut each utterance
    into sub-utterances, writing one line per sub-utterance to
    args.text_handle (<sub-utt-id> <words...>) and to
    args.subsegment_handle (<sub-utt-id> <old-utt-id> <start> <end>).

    Each ctm line must be: <utt-id> <channel> <start> <duration> <word>.
    Words in 'vocab' (and '<eps>', i.e. silence) extend the current
    sub-utterance; any other word closes it (being appended as a boundary
    word) and starts a new one.
    NOTE(review): the 'is_oov' naming is inverted relative to the vocab
    membership test — presumably 'vocab' is the list of words whose
    pronunciations we want to learn; confirm against the caller.
    """
    sub_utt = list()          # words of the sub-utterance being built
    is_oov = False
    utt_id_last = None        # utt-id seen on the previous ctm line
    start_times = {}          # sub-utt-id -> start time within old utt
    end_times = {}            # sub-utt-id -> end time within old utt
    sub_utts = {}             # sub-utt-id -> (old utt-id, word list)
    sub_utt_id = 1            # index of current sub-utt within the utt
    sub_utt_id_last = 1
    end_time_last = 0.0       # end time of the previous ctm entry
    for line in args.ctm_handle:
        splits = line.strip().split()
        if len(splits) < 5:
            raise Exception("problematic line", line)

        utt_id = splits[0]
        start = float(splits[2])
        dur = float(splits[3])
        word = splits[4]
        if utt_id != utt_id_last:
            # Crossed an utterance boundary: flush the trailing sub-utt of
            # the previous utterance (only if it has more than one word).
            sub_utt_id = 1
            if len(sub_utt) > 1:
                sub_utts[utt_id_last+'-'+str(sub_utt_id_last)] = (utt_id_last, sub_utt)
                # BUGFIX: was 'ent_time_last', a typo for 'end_time_last';
                # the typo'd name is undefined until one full loop iteration
                # has run, and the 0.0 initialization above was dead.
                end_times[utt_id_last+'-'+str(sub_utt_id_last)] = end_time_last
            sub_utt = []
            start_times[utt_id+'-'+str(sub_utt_id)] = start
            is_oov_last = False
        if word == '<eps>':
            # Silence: keep the sub-utterance open, just extend its end time.
            is_oov = True
            end_times[utt_id+'-'+str(sub_utt_id)] = start + dur
        elif word in vocab:
            is_oov = True
            sub_utt.append(word)
            end_times[utt_id+'-'+str(sub_utt_id)] = start + dur
        else:
            # Boundary word: close the current sub-utt (including this word
            # as the right boundary) and open a new one starting with it.
            is_oov = False
            if is_oov_last == True:
                sub_utt.append(word)
                sub_utts[utt_id+'-'+str(sub_utt_id_last)] = (utt_id, sub_utt)
                end_times[utt_id+'-'+str(sub_utt_id_last)] = start + dur
            sub_utt_id += 1
            sub_utt = [word]
            start_times[utt_id+'-'+str(sub_utt_id)] = start
        utt_id_last = utt_id
        sub_utt_id_last = sub_utt_id
        is_oov_last = is_oov
        end_time_last = start + dur

    # Flush the final sub-utterance if the ctm ended inside one.
    # NOTE(review): a trailing in-vocab word was already appended in the
    # loop body, so this may append it twice — confirm intended behavior.
    if is_oov:
        if word != '<eps>':
            sub_utt.append(word)
        sub_utts[utt_id+'-'+str(sub_utt_id_last)] = (utt_id, sub_utt)
        end_times[utt_id+'-'+str(sub_utt_id_last)] = start + dur

    for utt, v in sorted(sub_utts.items()):
        print(utt, ' '.join(sub_utts[utt][1]), file=args.text_handle)
        print(utt, sub_utts[utt][0], start_times[utt], end_times[utt], file=args.subsegment_handle)

def ReadVocab(vocab_file_handle):
    """Read a vocab file (one word per line) from the given handle and
    return the words as a set. A falsy handle yields an empty set; a line
    with more than one field raises an Exception."""
    words = set()
    if not vocab_file_handle:
        return words
    for line in vocab_file_handle.readlines():
        fields = line.strip().split()
        if len(fields) == 0:
            continue          # skip blank lines
        if len(fields) > 1:
            raise Exception('Invalid format of line ' + line
                            + ' in vocab file.')
        words.add(fields[0])
    return words

def Main():
    """Entry point: parse/check arguments, load the vocab, then extract
    sub-segments and sub-utterance text from the ctm."""
    opts = GetArgs()
    GetSubsegments(opts, ReadVocab(opts.vocab_handle))


if __name__ == "__main__":
    Main()
Loading