#!/bin/bash
#
# This script demonstrates a lexicon learning recipe, which aims to improve
# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_greedy.sh
# for explanation of the options.
#
# Copyright 2018 Xiaohui Zhang
# Apache 2.0

. ./cmd.sh
. ./path.sh

# NOTE(review): the original token here was stripped by angle-bracket mangling;
# "<unk>" is the conventional TED-LIUM OOV symbol -- confirm against data/lang/oov.txt.
oov_symbol="<unk>"
# The user may have a phonetisaurus-trained English g2p model ready.
g2p_mdl_dir=
# The dir which contains the reference lexicon (most probably hand-derived)
# we want to expand/improve, and nonsilence_phones.txt, etc. which we need
# for building new dict dirs.
ref_dict=data/local/dict
# Acoustic training data we use to get alternative pronunciations and
# collect acoustic evidence.
data=data/train
# The cut-off parameter used to select pronunciation candidates from phone
# decoding. We remove pronunciations with probabilities less than this value
# after normalizing the probs s.t. the max-prob is 1.0 for each word.
min_prob=0.1
# Refer to steps/dict/select_prons_greedy.sh for the detailed meaning of
# alpha, beta and delta. Basically, the three dimensions of alpha and beta
# correspond to three pronunciation sources: phonetic-decoding, G2P and the
# reference lexicon, and the larger a value is, the more aggressively we'll
# prune pronunciations from that source. The valid range of each dim. is
# [0, 1] for alpha (0 means we never prune prons from that source) and
# [0, 100] for beta.
alpha="0.04,0.02,0"
beta="30,5,0"
# Floor value of the pronunciation posterior statistics.
delta=0.00000001
# This parameter determines how many pronunciations we keep for each word
# after the first pass pruning. See steps/dict/internal/prune_pron_candidates.py
# for details.
vcr=16

# Intermediate outputs of the lexicon learning stage will be put into $dir.
dir=exp/tri3_lex_greedy_work
nj=35
decode_nj=30
stage=0
lexlearn_stage=0
affix="learned_greedy"

. utils/parse_options.sh # accept options

# The reference vocab is the list of words for which we already have
# hand-derived pronunciations.
ref_vocab=data/local/vocab.txt
cat $ref_dict/lexicon.txt | awk '{print $1}' | sort | uniq > $ref_vocab || exit 1;

# Get a G2P-generated lexicon for OOV words (w.r.t the reference lexicon)
# in acoustic training data.
if [ $stage -le 0 ]; then
  # Quoted so the test is well-formed even when g2p_mdl_dir is empty.
  if [ -z "$g2p_mdl_dir" ]; then
    g2p_mdl_dir=exp/g2p_phonetisaurus
    steps/dict/train_g2p_phonetisaurus.sh $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1;
  fi
  awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \
    $data/text | sort -u > $data/train_vocab.txt || exit 1;
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \
    $data/train_vocab.txt | sort > $data/oov_train.txt || exit 1;
  # NOTE(review): G2P is applied to the full training vocab here rather than
  # to $data/oov_train.txt computed just above -- confirm this is intended.
  steps/dict/apply_g2p_phonetisaurus.sh --nbest 5 $data/train_vocab.txt $g2p_mdl_dir \
    exp/g2p_phonetisaurus/lex_train || exit 1;
fi

# Learn a lexicon based on the acoustic training data and the reference lexicon.
if [ $stage -le 1 ]; then
  steps/dict/learn_lexicon_greedy.sh --lexiconp-g2p "exp/g2p_phonetisaurus/lex_train/lexicon.lex" \
    --alpha $alpha --beta $beta --delta $delta \
    --min-prob $min_prob --cmd "$train_cmd" \
    --variant-counts-ratio $vcr \
    --stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl false \
    $ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_${affix}_nosp \
    $dir || exit 1;
fi

# Add pronunciation probs to the learned lexicon.
if [ $stage -le 2 ]; then
  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
    data/local/dict_${affix}_nosp $oov_symbol data/local/lang_${affix}_nosp data/lang_${affix}_nosp || exit 1;

  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
    $data data/lang_${affix}_nosp exp/tri2 exp/tri2_ali_${affix}_nosp || exit 1;

  steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_${affix}_nosp exp/tri2_ali_${affix}_nosp || exit 1;

  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_${affix}_nosp exp/tri2_ali_${affix}_nosp/pron_counts_nowb.txt \
    exp/tri2_ali_${affix}_nosp/sil_counts_nowb.txt \
    exp/tri2_ali_${affix}_nosp/pron_bigram_counts_nowb.txt data/local/dict_${affix} || exit 1;

  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
    data/local/dict_${affix} $oov_symbol data/local/lang_${affix} data/lang_${affix} || exit 1;
fi

# Re-decode with the learned lexicon.
if [ $stage -le 3 ]; then
  ! cmp data/lang_nosp/words.txt data/lang_${affix}/words.txt &&\
    echo "$0: The vocab of the affix lexicon and the reference vocab may be incompatible."
  cp data/lang_nosp/G.fst data/lang_${affix}/
  utils/mkgraph.sh data/lang_${affix} exp/tri3 exp/tri3/graph_${affix} || exit 1;

  for dset in dev test; do
    ( steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
        exp/tri3/graph_${affix} data/${dset} exp/tri3/decode_${affix}_${dset} || exit 1;
    ) &
  done
fi

# RESULTS:
# Baseline:
# %WER 18.7 | 507 17783 | 83.9 11.4 4.7 2.6 18.7 92.3 | -0.006 | exp/tri3/decode_dev/score_17_0.0/ctm.filt.filt.sys
# %WER 17.6 | 1155 27500 | 84.7 11.6 3.7 2.4 17.6 87.2 | 0.013 | exp/tri3/decode_test/score_15_0.0/ctm.filt.filt.sys

# Re-decoding with the learned lexicon:
# %WER 18.5 | 507 17783 | 84.3 11.2 4.5 2.8 18.5 92.3 | -0.007 | exp/tri3/decode_learned_greedy_dev/score_16_0.0/ctm.filt.filt.sys
# %WER 17.5 | 1155 27500 | 84.9 11.5 3.6 2.4 17.5 87.5 | 0.035 | exp/tri3/decode_learned_greedy_test/score_14_0.0/ctm.filt.filt.sys

# To see the effect to neural-net results, one should re-train NN with the learned lexicon.
# Experiments have shown that, with the new lang dir, one should just re-run NN training
# starting from the supervision generation (steps/align_fmllr_lats.sh) stage, and should
# expect improved overall WERs and word recognition performance on words whose pronunciations
# were changed.

# BUG FIX: the original script ran "exit" before "wait", so the backgrounded
# decoding jobs above were orphaned and never waited on. Wait first, then exit.
wait
exit 0;
The output contains accumulated soft-counts" - "of pronunciations", - epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" - " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" - " exp/tri3_lex_0.4_work/lats/pron_stats.txt" - "See steps/dict/learn_lexicon.sh for examples in detail.") + parser = argparse.ArgumentParser( + description = "Accumulate statistics from lattice-alignment outputs for lexicon" + "learning. The inputs are a file containing arc level information from lattice-align-words," + "and a map which maps word-position-dependent phones to word-position-independent phones" + "(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts" + "of pronunciations", + epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" + " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" + " exp/tri3_lex_0.4_work/lats/pron_stats.txt" + "See steps/dict/learn_lexicon.sh for examples in detail.") parser.add_argument("arc_info_file", metavar = "", type = str, help = "Input file containing per arc statistics; " diff --git a/egs/wsj/s5/steps/dict/internal/get_subsegments.py b/egs/wsj/s5/steps/dict/internal/get_subsegments.py new file mode 100755 index 00000000000..c431b4c7066 --- /dev/null +++ b/egs/wsj/s5/steps/dict/internal/get_subsegments.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import argparse +import sys +import string + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "The purpose of this script is to use a ctm and a vocab file" + "to extract sub-utterances and a sub-segmentation. Extracted sub-utterances" + "are all the strings of consecutive in-vocab words from the ctm" + "surrounded by an out-of-vocab word at each end if present.", + epilog = "e.g. 
def CheckArgs(args):
    """Open file handles for the parsed arguments.

    "-" means stdin for the ctm (and vocab) inputs. Returns args augmented
    with *_handle attributes.
    """
    if args.ctm == "-":
        args.ctm_handle = sys.stdin
    else:
        args.ctm_handle = open(args.ctm)

    # BUG FIX: the original compared with "is not ''" (identity, not
    # equality) and mapped "-" to sys.stdout even though the vocab is read.
    if args.vocab != '':
        if args.vocab == "-":
            args.vocab_handle = sys.stdin
        else:
            args.vocab_handle = open(args.vocab)

    args.subsegment_handle = open(args.subsegment, 'w')
    args.text_handle = open(args.text, 'w')

    return args


def GetSubsegments(args, vocab):
    """Cut each ctm utterance into sub-utterances of consecutive in-vocab words.

    A sub-utterance is a maximal run of in-vocab words, extended by the
    flanking out-of-vocab word at each end when present; the oov token
    extends the time span but is never emitted as a word. For each
    sub-utterance one line '<sub-utt-id> <word> ...' is written to
    args.text_handle and one line '<sub-utt-id> <utt-id> <start> <end>'
    to args.subsegment_handle.

    Fixes w.r.t. the original:
      (1) the final in-vocab word of the last sub-utterance is no longer
          appended a second time by the end-of-ctm flush (it is already
          appended when the line is read);
      (2) the misspelled 'ent_time_last' is unified with the initialized
          'end_time_last', removing a dead initializer / latent NameError.
    """
    sub_utt = []            # words of the sub-utterance currently being built
    is_kept = False         # True if the current word extends the sub-utterance
    is_kept_last = False
    utt_id_last = None
    utt_id = None
    start_times = {}        # sub-utt-id -> start time
    end_times = {}          # sub-utt-id -> end time
    sub_utts = {}           # sub-utt-id -> (utt-id, word list)
    sub_utt_id = 1
    sub_utt_id_last = 1
    end_time_last = 0.0
    for line in args.ctm_handle:
        splits = line.strip().split()
        if len(splits) < 5:
            raise Exception("problematic line", line)

        utt_id = splits[0]
        start = float(splits[2])
        dur = float(splits[3])
        word = splits[4]
        if utt_id != utt_id_last:
            # New utterance: flush the unfinished sub-utterance of the
            # previous one (only if it has more than one word).
            sub_utt_id = 1
            if len(sub_utt) > 1:
                key_prev = utt_id_last + '-' + str(sub_utt_id_last)
                sub_utts[key_prev] = (utt_id_last, sub_utt)
                end_times[key_prev] = end_time_last
            sub_utt = []
            start_times[utt_id + '-' + str(sub_utt_id)] = start
            is_kept_last = False
        key = utt_id + '-' + str(sub_utt_id)
        # NOTE(review): the original literal here was stripped by text
        # mangling; assumed to be the '<unk>' oov token -- confirm upstream.
        if word == '<unk>':
            is_kept = True
            end_times[key] = start + dur
        elif word in vocab:
            is_kept = True
            sub_utt.append(word)
            end_times[key] = start + dur
        else:
            # Out-of-vocab word: it closes the current sub-utterance and also
            # opens the next one (attached to both as boundary context).
            is_kept = False
            if is_kept_last:
                sub_utt.append(word)
                key_last = utt_id + '-' + str(sub_utt_id_last)
                sub_utts[key_last] = (utt_id, sub_utt)
                end_times[key_last] = start + dur
            sub_utt_id += 1
            sub_utt = [word]
            start_times[utt_id + '-' + str(sub_utt_id)] = start
        utt_id_last = utt_id
        sub_utt_id_last = sub_utt_id
        is_kept_last = is_kept
        end_time_last = start + dur

    # Flush the trailing sub-utterance; its last word was already appended
    # inside the loop, so nothing is appended here (see fix (1) above).
    if is_kept and utt_id is not None:
        key = utt_id + '-' + str(sub_utt_id_last)
        sub_utts[key] = (utt_id, sub_utt)
        end_times[key] = end_time_last

    for key in sorted(sub_utts):
        print(key, ' '.join(sub_utts[key][1]), file=args.text_handle)
        print(key, sub_utts[key][0], start_times[key], end_times[key],
              file=args.subsegment_handle)


def ReadVocab(vocab_file_handle):
    """Read a one-word-per-line vocab file into a set of words."""
    vocab = set()
    if vocab_file_handle:
        for line in vocab_file_handle.readlines():
            splits = line.strip().split()
            if len(splits) == 0:
                continue
            if len(splits) > 1:
                raise Exception('Invalid format of line ' + line
                                + ' in vocab file.')
            vocab.add(splits[0])
    return vocab


def Main():
    args = GetArgs()

    vocab = ReadVocab(args.vocab_handle)
    GetSubsegments(args, vocab)


if __name__ == "__main__":
    Main()
from __future__ import print_function @@ -10,27 +10,36 @@ import math def GetArgs(): - parser = argparse.ArgumentParser(description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" - "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation" - "cadidates according to their soft-counts, and then select the top r * N candidates" - "(For words in the reference lexicon, N = # pron variants given by the reference" - "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." - "r is a user-specified constant, like 2.", - epilog = "See steps/dict/learn_lexicon.sh for example") - - parser.add_argument("--r", type = float, default = "2.0", - help = "a user-specified ratio parameter which determines how many" - "pronunciation candidates we want to keep for each word.") + parser = argparse.ArgumentParser( + description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" + "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation" + "cadidates according to their soft-counts, and then select the top variant-counts-ratio * N candidates" + "(For words in the reference lexicon, N = # pron variants given by the reference" + "lexicon; For oov words, N = avg. 
# pron variants per word in the reference lexicon).", + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") + + parser.add_argument("--variant-counts-ratio", type = float, default = "3.0", + help = "A user-specified ratio parameter which determines how many" + "pronunciation candidates we want to keep for each word at most.") parser.add_argument("pron_stats", metavar = "", type = str, - help = "File containing soft-counts of all pronounciation candidates; " + help = "File containing soft-counts of pronounciation candidates; " "each line must be ") + parser.add_argument("lexicon_phonetic_decoding", metavar = "", type = str, + help = "Lexicon containing pronunciation candidates from phonetic decoding." + "each line must be ") + parser.add_argument("lexiconp_g2p", metavar = "", type = str, + help = "Lexicon with probabilities for pronunciation candidates from G2P." + "each line must be ") parser.add_argument("ref_lexicon", metavar = "", type = str, help = "Reference lexicon file, where we obtain # pron variants for" "each word, based on which we prune the pron candidates." "Each line must be ") - parser.add_argument("pruned_prons", metavar = "", type = str, - help = "An output file in lexicon format, which contains prons we want to" - "prune off from the pron_stats file.") + parser.add_argument("lexicon_phonetic_decoding_pruned", metavar = "", type = str, + help = "Output lexicon containing pronunciation candidates from phonetic decoding after pruning." + "each line must be ") + parser.add_argument("lexicon_g2p_pruned", metavar = "", type = str, + help = "Output lexicon containing pronunciation candidates from G2P after pruning." 
+ "each line must be ") print (' '.join(sys.argv), file=sys.stderr) @@ -40,12 +49,13 @@ def GetArgs(): return args def CheckArgs(args): + print(args) args.pron_stats_handle = open(args.pron_stats) + args.lexicon_phonetic_decoding_handle = open(args.lexicon_phonetic_decoding) + args.lexiconp_g2p_handle = open(args.lexiconp_g2p) args.ref_lexicon_handle = open(args.ref_lexicon) - if args.pruned_prons == "-": - args.pruned_prons_handle = sys.stdout - else: - args.pruned_prons_handle = open(args.pruned_prons, "w") + args.lexicon_phonetic_decoding_pruned_handle = open(args.lexicon_phonetic_decoding_pruned, "w") + args.lexicon_g2p_pruned_handle = open(args.lexicon_g2p_pruned, "w") return args def ReadStats(pron_stats_handle): @@ -62,13 +72,11 @@ def ReadStats(pron_stats_handle): phones = ' '.join(splits[2:]) stats[word].append((phones, count)) - for word, entry in stats.iteritems(): - entry.sort(key=lambda x: x[1]) return stats -def ReadLexicon(ref_lexicon_handle): - ref_lexicon = defaultdict(set) - for line in ref_lexicon_handle.readlines(): +def ReadLexicon(lexicon_handle): + lexicon = defaultdict(set) + for line in lexicon_handle.readlines(): splits = line.strip().split() if len(splits) == 0: continue @@ -77,42 +85,74 @@ def ReadLexicon(ref_lexicon_handle): + ' in lexicon file.') word = splits[0] phones = ' '.join(splits[1:]) - ref_lexicon[word].add(phones) - return ref_lexicon + lexicon[word].add(phones) + return lexicon -def PruneProns(args, stats, ref_lexicon): +def ReadLexiconp(lexiconp_handle): + lexicon = defaultdict(set) + pron_probs = defaultdict(float) + for line in lexiconp_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 3: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[1] + prob = float(splits[0]) + phones = ' '.join(splits[2:]) + pron_probs[(word, phones)] = prob + lexicon[word].add(phones) + return lexicon, pron_probs + +def PruneProns(args, stats, 
ref_lexicon, lexicon_phonetic_decoding, lexicon_g2p, lexicon_g2p_probs): + # For those pron candidates from lexicon_phonetic_decoding/g2p which don't + # have stats, we append them to the "stats" dict, with a zero count. + for word, entry in stats.iteritems(): + prons_with_stats = set() + for (pron, count) in entry: + prons_with_stats.add(pron) + for pron in lexicon_g2p[word]: + if pron not in prons_with_stats: + entry.append((pron, lexicon_g2p_probs[(word, pron)]-1.0)) + entry.sort(key=lambda x: x[1]) + # Compute the average # pron variants counts per word in the reference lexicon. num_words_ref = 0 num_prons_ref = 0 for word, prons in ref_lexicon.iteritems(): num_words_ref += 1 num_prons_ref += len(prons) - avg_variants_counts_ref = math.ceil(float(num_prons_ref) / float(num_words_ref)) - + avg_variant_counts_ref = round(float(num_prons_ref) / float(num_words_ref)) for word, entry in stats.iteritems(): if word in ref_lexicon: - variants_counts = args.r * len(ref_lexicon[word]) + variant_counts = args.variant_counts_ratio * len(ref_lexicon[word]) else: - variants_counts = args.r * avg_variants_counts_ref + variant_counts = args.variant_counts_ratio * avg_variant_counts_ref num_variants = 0 - while num_variants < variants_counts: + count = 0.0 + while num_variants < variant_counts: try: - pron, prob = entry.pop() - if word not in ref_lexicon or pron not in ref_lexicon[word]: + pron, count = entry.pop() + if word in ref_lexicon and pron in ref_lexicon[word]: + continue + if pron in lexicon_phonetic_decoding[word]: + num_variants += 1 + print('{0} {1}'.format(word, pron), file=args.lexicon_phonetic_decoding_pruned_handle) + if pron in lexicon_g2p[word]: num_variants += 1 + print('{0} {1}'.format(word, pron), file=args.lexicon_g2p_pruned_handle) except IndexError: break - - for word, entry in stats.iteritems(): - for pron, prob in entry: - if word not in ref_lexicon or pron not in ref_lexicon[word]: - print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle) 
def Main(): args = GetArgs() ref_lexicon = ReadLexicon(args.ref_lexicon_handle) + lexicon_phonetic_decoding = ReadLexicon(args.lexicon_phonetic_decoding_handle) + lexicon_g2p, lexicon_g2p_probs = ReadLexiconp(args.lexiconp_g2p_handle) stats = ReadStats(args.pron_stats_handle) - PruneProns(args, stats, ref_lexicon) + + PruneProns(args, stats, ref_lexicon, lexicon_phonetic_decoding, lexicon_g2p, lexicon_g2p_probs) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/dict/internal/sum_arc_info.py b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py new file mode 100755 index 00000000000..d3913ec954f --- /dev/null +++ b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0 + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "Accumulate statistics from per arc lattice statitics" + "for lexicon learning", + epilog = "See steps/dict/learn_lexicon.sh for example") + + parser.add_argument("--set-sum-to-one", type = str, default = True, + action = StrToBoolAction, choices = ["true", "false"], + help = "If normalize posteriors such that the sum of " + "pronunciation posteriors of a word in an utterance is 1.") + parser.add_argument("arc_info_file", metavar = "", type = str, + help = "File containing per arc statistics; " + "each line must be " + "") + parser.add_argument("phone_map", metavar = "", type = str, + help = "An input phone map 
def CheckArgs(args):
    """Resolve '-' to stdin/stdout and open all file handles."""
    if args.arc_info_file == "-":
        args.arc_info_file_handle = sys.stdin
    else:
        args.arc_info_file_handle = open(args.arc_info_file)

    args.phone_map_handle = open(args.phone_map)

    if args.stats_file == "-":
        args.stats_file_handle = sys.stdout
    else:
        args.stats_file_handle = open(args.stats_file, "w")

    return args


def Main():
    """Accumulate per-(word, utterance) pronunciation soft counts from arc info.

    Reads the phone map and the arc-info file, sums the posterior count of
    each pronunciation of each word occurrence, optionally normalizes the
    counts per (word, utt) to sum to one, and writes
    '<word> <utt> <start-frame> <count> <phones>' lines to the stats file.

    Fixes w.r.t. the original: the parsed --set-sum-to-one option is now
    honored (the original always normalized; the default of true preserves
    the old behavior), and the dead locals 'lexicon', 'prons', 'overlap'
    and 'sum_tot' were removed.
    """
    args = GetArgs()

    start_frames = {}   # (word, utt) -> first start frame seen for this pair
    # (word, utt) -> {pronunciation: accumulated soft count}
    stats = defaultdict(lambda: defaultdict(float))

    # Map word-position-dependent phones to position-independent ones.
    phone_map = {}
    for line in args.phone_map_handle:
        splits = line.strip().split()
        phone_map[splits[0]] = splits[1]

    for line in args.arc_info_file_handle:
        splits = line.strip().split()

        if len(splits) == 0:
            continue

        if len(splits) < 6:
            raise Exception('Invalid format of line ' + line
                            + ' in ' + args.arc_info_file)

        utt = splits[0]
        start_frame = int(splits[1])
        count = float(splits[3])
        word = splits[4]
        # NOTE(review): this literal was most likely an epsilon/oov token such
        # as '<eps>' before text mangling stripped the angle brackets; kept
        # as-is -- confirm against the upstream source.
        if word == '':
            continue

        phones = ' '.join(phone_map[phone] for phone in splits[5:])

        if (word, utt) not in start_frames:
            start_frames[(word, utt)] = start_frame

        # defaultdict makes the original "first occurrence vs. accumulate"
        # branching unnecessary.
        stats[(word, utt)][phones] += count

    for (word, utt) in stats:
        pron_counts = stats[(word, utt)]
        count_sum = sum(pron_counts.values())
        # By default we normalize the pron posteriors of each word in each
        # utterance so that they sum up exactly to one. If a word occurs twice
        # in an utterance, the effect is to average the posteriors of the two
        # occurrences into one "equivalent occurrence". This case should be
        # extremely rare if the utterances are short sub-utterances produced
        # by steps/dict/internal/get_subsegments.py.
        for phones, count in pron_counts.items():
            if args.set_sum_to_one:
                count = count / count_sum
            print(word, utt, start_frames[(word, utt)], count, phones,
                  file=args.stats_file_handle)

    args.stats_file_handle.close()


if __name__ == "__main__":
    Main()
+ echo " --variant-counts-ratio # This ratio parameter determines the maximum number of pronunciation" + echo " # candidates we will keep for each word, after pruning according to lattice statistics from" + echo " # the first iteration of lattice generation. See steps/dict/internal/prune_pron_candidates.py" + echo " # for details." echo " --prior-mean # Mean of priors (summing up to 1) assigned to three exclusive pronunciation" echo " # source: reference lexicon, g2p, and phonetic decoding (used in the Bayesian" echo " # pronunciation selection procedure). We recommend setting a larger prior" @@ -150,17 +155,17 @@ if [ $stage -le 0 ]; then # Remove non-scored-words from the reference lexicon. awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ - $ref_dict/lexicon.txt | tr -s '\t' ' ' > $dir/ref_lexicon.txt + $ref_dict/lexicon.txt | tr -s '\t' ' ' | awk '$1=$1' > $dir/ref_lexicon.txt cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ $target_vocab | sort | uniq > $dir/target_vocab.txt # From the reference lexicon, we estimate the target_num_prons_per_word as, - # ceiling(avg. # prons per word in the reference lexicon). This'll be used as + # round(avg. # prons per word in the reference lexicon). This'll be used as # the upper bound of # pron variants per word when we apply G2P or select prons to # construct the learned lexicon in later stages. - python -c 'import sys; import math; print int(math.ceil(float(sys.argv[1])/float(sys.argv[2])))' \ + python -c 'import sys; import math; print int(round(float(sys.argv[1])/float(sys.argv[2])))' \ `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \ > $dir/target_num_prons_per_word || exit 1; @@ -225,10 +230,11 @@ if [ $stage -le 2 ]; then # Get the oov words list (w.r.t ref vocab) which are in training data. 
awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \ - $dir/train_counts.txt | sort > $dir/oov_train.txt + $dir/train_counts.txt | awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' \ + $dir/non_scored_words - | sort > $dir/oov_train.txt || exit 1; awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \ - $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate + $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate || exit 1; echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:" cat $dir/train_oov_rate @@ -237,14 +243,14 @@ if [ $stage -le 2 ]; then # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciaiton # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on. awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \ - $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt + $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt || exit 1; # Get the pronunciation of oov_symbol. - oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | cut -f2- -d' '` + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` # For oov words in training data for which we don't even have G2P pron candidates, # we simply assign them the pronunciation of the oov symbol (like ). 
awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \ - $dir/oov_train.txt | awk -v op=$oov_pron '{print $0" "op}' > $dir/oov_train_no_pron.txt + $dir/oov_train.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_train_no_pron.txt || exit 1; cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \ awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ @@ -263,7 +269,7 @@ if [ $stage -le 3 ]; then # We prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob", # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon. - cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt > $dir/phonetic_decoding/filter_lexicon.txt + cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt | sort -u > $dir/phonetic_decoding/filter_lexicon.txt $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \ --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \ @@ -295,7 +301,7 @@ if [ $stage -le 4 ]; then # Generate lattices for the acoustic training data with the combined lexicon. if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi - steps/align_fmllr_lats.sh --cmd "$decode_cmd" --nj $nj \ + steps/align_fmllr_lats.sh --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \ $data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1; # Get arc level information from the lattice. @@ -321,13 +327,10 @@ if [ $stage -le 5 ]; then rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null # Prune away pronunciations which have low acoustic evidence from the first pass of lattice alignment. 
- $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py $dir/lats_iter1/pron_stats.txt $dir/ref_lexicon.txt $dir/pruned_prons.txt - - awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_phonetic_decoding.txt \ - > $dir/lexicon_phonetic_decoding_pruned.txt - - awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_g2p.txt \ - > $dir/lexicon_g2p_pruned.txt \ + $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \ + --variant-counts-ratio $variant_counts_ratio \ + $dir/lats_iter1/pron_stats.txt $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \ + $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt # Filter out words which don't appear in the acoustic training data cat $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt \ @@ -402,7 +405,7 @@ if [ $stage -le 7 ]; then # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any. cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \ awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \ - $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt + $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt || exit 1; awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \ $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_no_acoustics.txt @@ -426,5 +429,5 @@ if [ $stage -le 8 ]; then echo " ... sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon." 
cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \ - sort | uniq > $dest_dict/lexicon.txt + sort | uniq > $dest_dict/lexicon.txt || exit 1; fi diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh new file mode 100755 index 00000000000..83aa98c1700 --- /dev/null +++ b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh @@ -0,0 +1,546 @@ +#! /bin/bash + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0 + +# This recipe has similar inputs and outputs as steps/dict/learn_lexicon.sh +# The major difference is, instead of using a Bayesian framework for +# pronunciation selection, we used a likelihood-reduction based greedy +# pronunciation selection framework presented in the paper: +# "Acoustic data-driven lexicon learning based on a greedy pronunciation " +# "selection framework, by X. Zhang, V. Mahonar, D. Povey and S. Khudanpur," +# "Interspeech 2017." + +# This script demonstrate how to expand a existing lexicon using a combination +# of acoustic evidence and G2P to learn a lexicon that covers words in a target +# vocab, and agrees sufficiently with the acoustics. The basic idea is to +# run phonetic decoding on acoustic training data using an existing +# acoustice model (possibly re-trained using a G2P-expanded lexicon) to get +# alternative pronunciations for words in training data. Then we combine three +# exclusive sources of pronunciations: the reference lexicon (supposedly +# hand-derived), phonetic decoding, and G2P (optional) into one lexicon and then run +# lattice alignment on the same data, to collect acoustic evidence (soft +# counts) of all pronunciations. Based on these statistics, we use a greedy +# framework (see steps/dict/select_prons_greedy.sh for details) to select an +# informative subset of pronunciations for each word with acoustic evidence. 
+# two important parameters are alpha and beta. Basically, the three dimensions of alpha +# and beta correspond to three pronunciation sources: phonetic-decoding, G2P and +# the reference lexicon, and the larger a value is, the more aggressive we'll +# prune pronunciations from that sooure. The valid range of each dim. is [0, 1] +# (for alpha, and 0 means we never pruned pron from that source.) [0, 100] (for beta). +# The output of steps/dict/select_prons_greedy.sh is a learned lexicon whose vocab +# matches the user-specified target-vocab, and two intermediate outputs which were +# used to generate the learned lexicon: an edits file which records the recommended +# changes to all in-ref-vocab words' prons, and a half-learned lexicon +# ($dest_dict/lexicon0.txt) where all in-ref-vocab words' prons were untouched +# (on top of which we apply the edits file to produce the final learned lexicon). +# The user can always modify the edits file manually and then re-apply it on the +# half-learned lexicon using steps/dict/apply_lexicon_edits.sh to produce the +# final learned lexicon. See the last stage in this script for details. + +stage=0 +# Begin configuration section. +cmd=queue.pl +nj= +stage=0 +oov_symbol= +lexiconp_g2p= +min_prob=0.3 +variant_counts_ratio=8 +variant_counts_no_acoustics=1 +alpha="0,0,0" +beta="0,0,0" +delta=0.0000001 +num_gauss= +num_leaves= +retrain_src_mdl=true +cleanup=true +nj_select_prons=200 +learn_iv_prons=false # whether we want to learn the prons of IV words (w.r.t. ref_vocab), + +# End configuration section. + +. ./path.sh +. utils/parse_options.sh + +if [ $# -lt 6 ] || [ $# -gt 7 ]; then + echo "Usage: $0 [options] \\" + echo " ." 
+ echo " This script does lexicon expansion using a combination of acoustic" + echo " evidence and G2P to produce a lexicon that covers words of a target vocab:" + echo "" + echo "Arguments:" + echo " The dir which contains the reference lexicon (most probably hand-derived)" + echo " we want to expand/improve, and nonsilence_phones.txt,.etc which we need " + echo " for building new dict dirs." + echo " The vocabulary we want the final learned lexicon to cover (one word per line)." + echo " acoustic training data we use to get alternative" + echo " pronunciations and collet acoustic evidence." + echo " The dir containing an SAT-GMM acoustic model (we optionaly we re-train it" + echo " using G2P expanded lexicon) to do phonetic decoding (to get alternative" + echo " pronunciations) and lattice-alignment (to collect acoustic evidence for" + echo " evaluating all prounciations)" + echo " The reference lang dir which we use to get non-scored-words" + echo " like for building new dict dirs" + echo " The dict dir where we put the final learned lexicon, whose vocab" + echo " matches ." + echo " The dir which contains all the intermediate outputs of this script." + echo "" + echo "Note: and the vocab of don't have to match. For words" + echo " who are in but not seen in , their pronunciations" + echo " will be given by G2P at the end." + echo "" + echo "e.g. $0 data/local/dict data/local/lm/librispeech-vocab.txt data/train \\" + echo " exp/tri3 data/lang data/local/dict_learned" + echo "Options:" + echo " --stage # stage to run from, to enable resuming from partially" + echo " # completed run (default: 0)" + echo " --cmd '$cmd' # command to submit jobs with (e.g. run.pl, queue.pl)" + echo " --nj # number of parallel jobs" + echo " --oov-symbol '$oov_symbol' # oov symbol, like ." + echo " --lexiconp-g2p # a lexicon (with prob in the second column) file containing g2p generated" + echo " # pronunciations, for words in acoustic training data / target vocabulary. 
It's optional." + echo " --min-prob # The cut-off parameter used to select pronunciation candidates from phonetic" + echo " # decoding. We remove pronunciations with probabilities less than this value" + echo " # after normalizing the probs s.t. the max-prob is 1.0 for each word." + echo " --variant-counts-ratio # This ratio parameter determines the maximum number of pronunciation" + echo " # candidates we will keep for each word, after pruning according to lattice statistics from" + echo " # the first iteration of lattice generation. See steps/dict/internal/prune_pron_candidates.py" + echo " # for details." + echo " --variant-counts-no-acoustics # how many g2p-prons per word we want to include for each words unseen in acoustic training data." + echo " --alpha ,, # scaling factors used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --beta ,, # smoothing factors used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --delta # a floor value used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --num-gauss # number of gaussians for the re-trained SAT model (on top of )." + echo " --num-leaves # number of leaves for the re-trained SAT model (on top of )." + echo " --retrain-src-mdl # true if you want to re-train the src_mdl before phone decoding (default false)." + exit 1 +fi + +echo "$0 $@" # Print the command line for logging + +ref_dict=$1 +target_vocab=$2 +data=$3 +src_mdl_dir=$4 +ref_lang=$5 +dest_dict=$6 + +if [ -z "$oov_symbol" ]; then + echo "$0: the --oov-symbol option is required." + exit 1 +fi + +if [ $# -gt 6 ]; then + dir=$7 # Most intermediate outputs will be put here. +else + dir=${src_mdl_dir}_lex_learn_work +fi + +mkdir -p $dir +if [ $stage -le 0 ]; then + echo "$0: Some preparatory work." + # Get the word counts of training data. 
+ awk '{for (n=2;n<=NF;n++) counts[$n]++;} END{for (w in counts) printf "%s %d\n",w, counts[w];}' \ + $data/text | sort > $dir/train_counts.txt + + # Get the non-scored entries and exclude them from the reference lexicon/vocab, and target_vocab. + steps/cleanup/internal/get_non_scored_words.py $ref_lang > $dir/non_scored_words + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt > $dir/non_scored_entries + + # Remove non-scored-words from the reference lexicon. + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt | tr -s '\t' ' ' | awk '$1=$1' > $dir/ref_lexicon.txt + + cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $target_vocab | sort | uniq > $dir/target_vocab.txt + + # From the reference lexicon, we estimate the target_num_prons_per_word as, + # round(avg. # prons per word in the reference lexicon). This'll be used as + # the upper bound of # pron variants per word when we apply G2P or select prons to + # construct the learned lexicon in later stages. + python -c 'import sys; import math; print int(round(float(sys.argv[1])/float(sys.argv[2])))' \ + `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \ + > $dir/target_num_prons_per_word || exit 1; + + if [ -z $lexiconp_g2p ]; then + # create an empty list of g2p generated prons, if it's not given. + touch $dir/lexicon_g2p.txt + touch $dir/lexiconp_g2p.txt + else + # Exchange the 1st column (word) and 2nd column (prob) and remove pronunciations + # which are already in the reference lexicon. 
+ cat $lexiconp_g2p | awk '{a=$1;b=$2; $1="";$2="";print b" "a$0}' | \ + awk 'NR==FNR{a[$0] = 1; next} {w=$2;for (n=3;n<=NF;n++) w=w" "$n; if(!(w in a)) print $0}' \ + $dir/ref_lexicon.txt - > $dir/lexiconp_g2p.txt 2>/dev/null + + # make a copy where we remove the first column (probabilities). + cat $dir/lexiconp_g2p.txt | cut -f1,3- > $dir/lexicon_g2p.txt 2>/dev/null + fi + variant_counts=`cat $dir/target_num_prons_per_word` || exit 1; + $cmd $dir/log/prune_g2p_lexicon.log steps/dict/prons_to_lexicon.py \ + --top-N=$variant_counts $dir/lexiconp_g2p.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt || exit 1; +fi + +if [ $stage -le 1 ] && $retrain_src_mdl; then + echo "$0: Expand the reference lexicon to cover all words in the target vocab. and then" + echo " ... re-train the source acoustic model for phonetic decoding. " + mkdir -p $dir/dict_expanded_target_vocab + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_target_vocab 2>/dev/null + rm $dir/dict_expanded_target_vocab/lexiconp.txt $dir/dict_expanded_target_vocab/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in the target vocab. + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $dir/ref_lexicon.txt \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_target_vocab.txt + + # Assign pronunciations from lexicon_g2p.txt to oov_target_vocab. For words which + # cannot be found in lexicon_g2p.txt, we simply ignore them. 
+ awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_target_vocab.txt \ + $dir/lexicon_g2p.txt > $dir/lexicon_g2p_oov_target_vocab.txt + + cat $dir/lexicon_g2p_oov_target_vocab.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt - | \ + cat $dir/non_scored_entries - | + sort | uniq > $dir/dict_expanded_target_vocab/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt $dir/dict_expanded_target_vocab \ + $oov_symbol $dir/lang_expanded_target_vocab_tmp $dir/lang_expanded_target_vocab || exit 1; + + # Align the acoustic training data using the given src_mdl_dir. + alidir=${src_mdl_dir}_ali_$(basename $data) + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $data $dir/lang_expanded_target_vocab $src_mdl_dir $alidir || exit 1; + + # Train another SAT system on the given data and put it in $dir/${src_mdl_dir}_retrained + # this model will be used for phonetic decoding and lattice alignment later on. + if [ -z $num_leaves ] || [ -z $num_gauss ] ; then + echo "num_leaves and num_gauss need to be specified." && exit 1; + fi + steps/train_sat.sh --cmd "$train_cmd" $num_leaves $num_gauss \ + $data $dir/lang_expanded_target_vocab $alidir $dir/${src_mdl_dir}_retrained || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: Expand the reference lexicon to cover all words seen in," + echo " ... acoustic training data, and prepare corresponding dict and lang directories." + echo " ... This is needed when generate pron candidates from phonetic decoding." + mkdir -p $dir/dict_expanded_train + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_train 2>/dev/null + rm $dir/dict_expanded_train/lexiconp.txt $dir/dict_expanded_train/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in training data. 
+ awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \ + $dir/train_counts.txt | awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' \ + $dir/non_scored_words - | sort > $dir/oov_train.txt || exit 1; + + awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \ + $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate || exit 1; + + echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:" + cat $dir/train_oov_rate + + # Assign pronunciations from lexicon_g2p to oov_train. For words which + # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciaiton + # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on. + variant_counts=`cat $dir/target_num_prons_per_word` || exit 1; + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt > $dir/g2p_prons_for_oov_train.txt || exit 1; + + # Get the pronunciation of oov_symbol. + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` + # For oov words in training data for which we don't even have G2P pron candidates, + # we simply assign them the pronunciation of the oov symbol (like ), + # so that we can get pronunciations for them from phonetic decoding. 
+ awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \ + $dir/oov_train.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_train_no_pron.txt || exit 1; + + cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat - $dir/non_scored_entries | \ + sort | uniq > $dir/dict_expanded_train/lexicon.txt || exit 1; + + utils/prepare_lang.sh $dir/dict_expanded_train $oov_symbol \ + $dir/lang_expanded_train_tmp $dir/lang_expanded_train || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Generate pronunciation candidates from phonetic decoding on acoustic training data.." + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/cleanup/debug_lexicon.sh --nj $nj \ + --cmd "$decode_cmd" $data $dir/lang_expanded_train \ + $mdl_dir $dir/dict_expanded_train/lexicon.txt $dir/phonetic_decoding || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: Combine the reference lexicon and pronunciations from phone-decoding/G2P into one" + echo " ... lexicon, and run lattice alignment using this lexicon on acoustic training data" + echo " ... to collect acoustic evidence." + # We first prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob", + # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon. + cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt | sort -u > $dir/phonetic_decoding/filter_lexicon.txt + + $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \ + --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \ + $dir/phonetic_decoding/prons.txt $dir/lexicon_pd_with_eps.txt + + # We abandon phonetic-decoding candidates for infrequent words. 
+ awk '{if($2 < 3) print $1}' $dir/train_counts.txt > $dir/pd_candidates_to_exclude.txt + awk 'NR==FNR{a[$1] = $2; next} {if(a[$1]<10) print $1}' $dir/train_counts.txt \ + $dir/oov_train_no_pron.txt >> $dir/pd_candidates_to_exclude.txt + + if [ -s $dir/pd_candidates_to_exclude.txt ]; then + cat $dir/lexicon_pd_with_eps.txt | grep -vP "|||\[.*\]" | \ + awk 'NR==FNR{a[$0] = 1; next} {if(!($1 in a)) print $0}' $dir/pd_candidates_to_exclude.txt - | \ + sort | uniq > $dir/lexicon_pd.txt || exit 1; + else + cat $dir/lexicon_pd_with_eps.txt | grep -vP "|||\[.*\]" | \ + sort | uniq > $dir/lexicon_pd.txt || exit 1; + fi + + # Combine the reference lexicon, pronunciations from G2P and phonetic decoding into one lexicon. + mkdir -p $dir/dict_combined_iter1 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter1/ 2>/dev/null + rm $dir/dict_combined_iter1/lexiconp.txt $dir/dict_combined_iter1/lexicon.txt 2>/dev/null + + # Filter out words which don't appear in the acoustic training data + cat $dir/lexicon_pd.txt $dir/lexicon_g2p.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter1/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter1 $oov_symbol \ + $dir/lang_combined_iter1_tmp $dir/lang_combined_iter1 || exit 1; + + # Generate lattices for the acoustic training data with the combined lexicon. + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + + # Get the vocab for words for which we want to learn pronunciations. + if $learn_iv_prons; then + # If we want to learn the prons of IV words (w.r.t. ref_vocab), the learn_vocab is just the intersection of + # target_vocab and the vocab of words seen in acoustic training data (first col. 
of train_counts.txt) + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $1}' $dir/target_vocab.txt $dir/train_counts.txt \ + > $dir/learn_vocab.txt + else + # Exclude words from the ref_vocab if we don't want to learn the pronunciations of IV words. + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $1}' $dir/target_vocab.txt $dir/train_counts.txt | \ + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_vocab.txt - > $dir/learn_vocab.txt + fi + + # In order to get finer lattice stats of alternative prons, we want to make lattices deeper. + # To speed up lattice generation, we use a ctm to create sub-utterances and a sub-segmentation + # for each instance of a word within learn_vocab (or a string of consecutive words within learn_vocab), + # including a single out-of-learn-vocab word at the boundary if present. + mkdir -p $dir/resegmentation + steps/dict/internal/get_subsegments.py $dir/phonetic_decoding/word.ctm $dir/learn_vocab.txt \ + $dir/resegmentation/subsegments $dir/resegmentation/text || exit 1; + utils/data/subsegment_data_dir.sh $data $dir/resegmentation/subsegments $dir/resegmentation/text \ + $dir/resegmentation/data || exit 1; + steps/compute_cmvn_stats.sh $dir/resegmentation/data || exit 1; + + steps/align_fmllr_lats.sh --beam 20 --retry-beam 50 --final-beam 30 --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \ + $dir/resegmentation/data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1; + + # Get arc level information from the lattice. 
+ $cmd JOB=1:$nj $dir/lats_iter1/log/get_arc_info.JOB.log \
+ lattice-align-words $dir/lang_combined_iter1/phones/word_boundary.int \
+ $dir/lats_iter1/final.mdl \
+ "ark:gunzip -c $dir/lats_iter1/lat.JOB.gz |" ark:- \| \
+ lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter1/final.mdl ark:- - \| \
+ utils/int2sym.pl -f 5 $dir/lang_combined_iter1/words.txt \| \
+ utils/int2sym.pl -f 6- $dir/lang_combined_iter1/phones.txt '>' \
+ $dir/lats_iter1/arc_info_sym.JOB.txt || exit 1;
+
+ # Compute soft counts (pron_stats) of every particular word-pronunciation pair by
+ # summing up arc level information over all utterances. We'll use this to prune
+ # pronunciation candidates before the next iteration of lattice generation.
+ cat $dir/lats_iter1/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \
+ $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/pron_stats.txt || exit 1;
+
+ # Accumulate utterance-level pronunciation posteriors (into arc_stats) by summing up
+ # posteriors of arcs representing the same word & pronunciation and starting
+ # from roughly the same location. See steps/dict/internal/sum_arc_info.py for details.
+ for i in `seq 1 $nj`;do
+ cat $dir/lats_iter1/arc_info_sym.${i}.txt | sort -n -k1 -k2 -k3r | \
+ steps/dict/internal/sum_arc_info.py - $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/arc_info_summed.${i}.txt
+ done
+ cat $dir/lats_iter1/arc_info_summed.*.txt | sort -k1 -k2 > $dir/lats_iter1/arc_stats.txt
+
+ # Prune the phonetic_decoding lexicon so that any pronunciation that only has non-zero posterior at one word example will be removed.
+ # The pruned lexicon is put in $dir/lats_iter1. After further pruning in the next stage it'll be put back to $dir. 
+ awk 'NR==FNR{w=$1;for (n=5;n<=NF;n++) w=w" "$n;a[w]+=1;next} {if($0 in a && a[$0]>1) print $0}' \ + $dir/lats_iter1/arc_stats.txt $dir/lexicon_pd.txt > $dir/lats_iter1/lexicon_pd_pruned.txt +fi + +# Here we re-generate lattices (with a wider beam and a pruned combined lexicon) and re-collect pronunciation statistics +if [ $stage -le 5 ]; then + echo "$0: Prune the pronunciation candidates generated from G2P/phonetic decoding, and re-do lattice-alignment." + mkdir -p $dir/dict_combined_iter2 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter2/ 2>/dev/null + rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null + + # Prune away pronunciations which have low acoustic evidence from the first pass of lattice generation. + $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \ + --variant-counts-ratio $variant_counts_ratio \ + $dir/lats_iter1/pron_stats.txt $dir/lats_iter1/lexicon_pd_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \ + $dir/lexicon_pd_pruned.txt $dir/lexicon_g2p_pruned.txt + + # Filter out words which don't appear in the acoustic training data. + cat $dir/lexicon_pd_pruned.txt $dir/lexicon_g2p_pruned.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter2/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter2 $oov_symbol \ + $dir/lang_combined_iter2_tmp $dir/lang_combined_iter2 || exit 1; + + # Re-generate lattices with a wider beam, so that we'll get deeper lattices. 
+ if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi
+ steps/align_fmllr_lats.sh --beam 30 --retry-beam 60 --final-beam 50 --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \
+ $dir/resegmentation/data $dir/lang_combined_iter2 $mdl_dir $dir/lats_iter2 || exit 1;
+
+ # Get arc level information from the lattice as we did in the last stage.
+ $cmd JOB=1:$nj $dir/lats_iter2/log/get_arc_info.JOB.log \
+ lattice-align-words $dir/lang_combined_iter2/phones/word_boundary.int \
+ $dir/lats_iter2/final.mdl \
+ "ark:gunzip -c $dir/lats_iter2/lat.JOB.gz |" ark:- \| \
+ lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter2/final.mdl ark:- - \| \
+ utils/int2sym.pl -f 5 $dir/lang_combined_iter2/words.txt \| \
+ utils/int2sym.pl -f 6- $dir/lang_combined_iter2/phones.txt '>' \
+ $dir/lats_iter2/arc_info_sym.JOB.txt || exit 1;
+
+ # Compute soft counts (pron_stats) of every particular word-pronunciation pair as
+ # we did in the last stage. The stats will only be used as diagnostics.
+ cat $dir/lats_iter2/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \
+ $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/pron_stats.txt || exit 1;
+
+ # Accumulate utterance-level pronunciation posteriors as we did in the last stage.
+ for i in `seq 1 $nj`;do
+ cat $dir/lats_iter2/arc_info_sym.${i}.txt | sort -n -k1 -k2 -k3r | \
+ steps/dict/internal/sum_arc_info.py - $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/arc_info_summed.${i}.txt
+ done
+ cat $dir/lats_iter2/arc_info_summed.*.txt | sort -k1 -k2 > $dir/lats_iter2/arc_stats.txt
+
+ # The pron_stats are the acoustic evidence which the likelihood-reduction-based pronunciation
+ # selection procedure will be based on.
+ # Split the utterance-level pronunciation posterior stats into $nj_select_prons pieces,
+ # so that the following pronunciation selection stage can be parallelized. 
+ numsplit=$nj_select_prons
+ awk '{print $1"-"$2" "$1}' $dir/lats_iter2/arc_stats.txt > $dir/lats_iter2/utt2word
+ utt2words=$(for n in `seq $numsplit`; do echo $dir/lats_iter2/utt2word.$n; done)
+ utils/split_scp.pl --utt2spk=$dir/lats_iter2/utt2word $dir/lats_iter2/utt2word $utt2words || exit 1
+ for n in `seq $numsplit`; do
+ (cat $dir/lats_iter2/utt2word.$n | awk '{$1=substr($1,length($2)+2);print $2" "$1}' - > $dir/lats_iter2/word2utt.$n
+ awk 'NR==FNR{a[$0] = 1; next} {b=$1" "$2; if(b in a) print $0}' $dir/lats_iter2/word2utt.$n \
+ $dir/lats_iter2/arc_stats.txt > $dir/lats_iter2/arc_stats.${n}.txt
+ ) &
+ done
+ wait
+fi
+
+if [ $stage -le 6 ]; then
+ echo "$0: Select pronunciations according to the acoustic evidence from lattice alignment."
+ # Given the acoustic evidence (soft-counts), we use a greedy, likelihood-reduction-based framework to select
+ # pronunciations from three exclusive candidate sources: reference (hand-derived) lexicon, G2P and phonetic decoding.
+ # The posteriors for all candidate prons for all words are printed into pron_posteriors.txt.
+ # For words which are out of the ref. vocab, the learned prons are written into out_of_ref_vocab_prons_learned.txt.
+ # Among them, for words without acoustic evidence, we just ignore them (even if pron candidates from G2P were provided).
+ # For words in the ref. vocab, we instead output a human readable & editable "edits" file called
+ # ref_lexicon_edits.txt, which records all proposed changes to the prons (if any). Also, a
+ # summary is printed into the log file. 
+ + $cmd JOB=1:$nj_select_prons $dir/lats_iter2/log/generate_learned_lexicon.JOB.log \ + steps/dict/select_prons_greedy.py \ + --alpha=${alpha} --beta=${beta} \ + --delta=${delta} \ + $ref_dict/silence_phones.txt $dir/lats_iter2/arc_stats.JOB.txt $dir/train_counts.txt $dir/ref_lexicon.txt \ + $dir/lexicon_g2p_pruned.txt $dir/lexicon_pd_pruned.txt \ + $dir/lats_iter2/learned_lexicon.JOB.txt || exit 1; + + cat $dir/lats_iter2/learned_lexicon.*.txt > $dir/lats_iter2/learned_lexicon.txt + rm $dir/lats_iter2/learned_lexicon.*.txt + + $cmd $dir/lats_iter2/log/lexicon_learning_summary.log \ + steps/dict/merge_learned_lexicons.py \ + $dir/lats_iter2/arc_stats.txt $dir/train_counts.txt $dir/ref_lexicon.txt \ + $dir/lexicon_g2p_pruned.txt $dir/lexicon_pd_pruned.txt \ + $dir/lats_iter2/learned_lexicon.txt \ + $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/lats_iter2/ref_lexicon_edits.txt || exit 1; + + cp $dir/lats_iter2/ref_lexicon_edits.txt $dir/lats_iter2/ref_lexicon_edits.txt + # Remove some stuff that takes up space and is unlikely to be useful later on. + if $cleanup; then + rm -r $dir/lats_iter*/{fsts*,lat*} 2>/dev/null + fi +fi + +if [ $stage -le 7 ]; then + echo "$0: Expand the learned lexicon further to cover words in target vocab that are." + echo " ... not seen in acoustic training data." + mkdir -p $dest_dict + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dest_dict 2>/dev/null + rm $dest_dict/lexiconp.txt $dest_dict/lexicon.txt 2>/dev/null + # Get the list of oov (w.r.t. ref vocab) without acoustic evidence, which are in the + # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any. 
+ cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt || exit 1; + + variant_counts=$variant_counts_no_acoustics + + $cmd $dir/log/prune_g2p_lexicon.log steps/dict/prons_to_lexicon.py \ + --top-N=$variant_counts $dir/lexiconp_g2p.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt || exit 1; + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt > $dir/g2p_prons_for_oov_no_acoustics.txt|| exit 1; + + # Get the pronunciation of oov_symbol. + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` || exit 1; + # For oov words in target_vocab for which we don't even have G2P pron candidates, + # we simply assign them the pronunciation of the oov symbol (like ), + if [ -s $dir/g2p_prons_for_oov_no_acoustics.txt ]; then + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_no_acoustics.txt \ + $dir/oov_no_acoustics.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_target_vocab_no_pron.txt || exit 1; + else + awk -v op="$oov_pron" '{print $0" "op}' $dir/oov_no_acoustics.txt > $dir/oov_target_vocab_no_pron.txt || exit 1 + fi + + # We concatenate three lexicons togethers: G2P lexicon for oov words without acoustics, + # learned lexicon for oov words with acoustics, and the original reference lexicon (for + # this part, later one we'll apply recommended changes using steps/dict/apply_lexicon_edits.py + cat $dir/g2p_prons_for_oov_no_acoustics.txt $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt \ + $dir/oov_target_vocab_no_pron.txt $dir/ref_lexicon.txt | tr -s '\t' ' ' | sort | uniq > $dest_dict/lexicon.temp + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt \ + $dest_dict/lexicon.temp | sort | uniq > $dest_dict/lexicon.nosil + + cat $dir/non_scored_entries $dest_dict/lexicon.nosil 
| sort | uniq >$dest_dict/lexicon0.txt +fi + +if [ $stage -le 8 ]; then + echo "$0: Apply the ref_lexicon_edits file to the reference lexicon." + echo " ... The user can inspect/modify the edits file and then re-run:" + echo " ... steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \\" + echo " ... sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon." + cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null + steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \ + sort | uniq > $dest_dict/lexicon.txt || exit 1; +fi + +echo "Lexicon learning ends successfully. Please refer to $dir/lats_iter2/log/lexicon_learning_summary.log" +echo " for a summary. The learned lexicon, whose vocab matches the target_vocab, is $dest_dict/lexicon.txt" diff --git a/egs/wsj/s5/steps/dict/merge_learned_lexicons.py b/egs/wsj/s5/steps/dict/merge_learned_lexicons.py new file mode 100755 index 00000000000..6df7eb7a744 --- /dev/null +++ b/egs/wsj/s5/steps/dict/merge_learned_lexicons.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "Convert a learned lexicon produced by steps/dict/select_prons_greedy.sh" + "into a lexicon for OOV words (w.r.t. ref. vocab) and a human editable lexicon-edit file." + "for in-vocab words, and generate detailed summaries of the lexicon learning results" + "The inputs are a learned lexicon, an arc-stats file, and three source lexicons " + "(phonetic-decoding(PD)/G2P/ref). 
The outputs are: a learned lexicon for OOVs" + "(learned_lexicon_oov), and a lexicon_edits file (ref_lexicon_edits) containing" + "suggested modifications of prons, for in-vocab words.", + epilog = "See steps/dict/learn_lexicon_greedy.sh for example.") + parser.add_argument("arc_stats_file", metavar = "", type = str, + help = "File containing word-pronunciation statistics obtained from lattices; " + "each line must be ") + parser.add_argument("word_counts_file", metavar = "", type = str, + help = "File containing word counts in acoustic training data; " + "each line must be .") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "The reference lexicon (most probably hand-derived)." + "Each line must be ") + parser.add_argument("g2p_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from G2P results." + "Each line must be ") + parser.add_argument("pd_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from phonetic decoding results." + "Each line must be ") + parser.add_argument("learned_lexicon", metavar = "", type = str, + help = "Learned lexicon." + "Each line must be ") + parser.add_argument("learned_lexicon_oov", metavar = "", type = str, + help = "Output file which is the learned lexicon for words out of the ref. vocab.") + parser.add_argument("ref_lexicon_edits", metavar = "", type = str, + help = "Output file containing human-readable & editable pronounciation info (and the" + "accept/reject decision made by our algorithm) for those words in ref. vocab," + "to which any change has been recommended. The info for each word is like:" + "------------ an 4086.0 --------------" + "R | Y | 2401.6 | AH N" + "R | Y | 640.8 | AE N" + "P | Y | 1035.5 | IH N" + "R(ef), P(hone-decoding) represents the pronunciation source" + "Y/N means the recommended decision of including this pron or not" + "and the numbers are soft counts accumulated from lattice-align-word outputs. 
" + "See the function WriteEditsAndSummary for more details.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.arc_stats_file == "-": + args.arc_stats_file_handle = sys.stdin + else: + args.arc_stats_file_handle = open(args.arc_stats_file) + args.word_counts_file_handle = open(args.word_counts_file) + args.ref_lexicon_handle = open(args.ref_lexicon) + args.g2p_lexicon_handle = open(args.g2p_lexicon) + args.pd_lexicon_handle = open(args.pd_lexicon) + args.learned_lexicon_handle = open(args.learned_lexicon) + args.learned_lexicon_oov_handle = open(args.learned_lexicon_oov, "w") + args.ref_lexicon_edits_handle = open(args.ref_lexicon_edits, "w") + + return args + +def ReadArcStats(arc_stats_file_handle): + stats = defaultdict(lambda : defaultdict(dict)) + stats_summed = defaultdict(float) + for line in arc_stats_file_handle.readlines(): + splits = line.strip().split() + + if (len(splits) == 0): + continue + + if (len(splits) < 5): + raise Exception('Invalid format of line ' + line + + ' in ' + arc_stats_file) + utt = splits[1] + start_frame = int(splits[2]) + word = splits[0] + count = float(splits[3]) + phones = splits[4:] + phones = ' '.join(phones) + stats[word][(utt, start_frame)][phones] = count + stats_summed[(word, phones)] += count + return stats, stats_summed + +def ReadWordCounts(word_counts_file_handle): + counts = {} + for line in word_counts_file_handle.readlines(): + splits = line.strip().split() + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in counts file.') + word = splits[0] + count = int(splits[1]) + counts[word] = count + return counts + +def ReadLexicon(args, lexicon_file_handle, counts): + # we're skipping any word not in counts (not seen in training data), + # cause we're only learning prons for words who have acoustic examples. 
+ lexicon = defaultdict(set) + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + if word not in counts: + continue + phones = ' '.join(splits[1:]) + lexicon[word].add(phones) + return lexicon + +def WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, pd_lexicon, g2p_lexicon, counts, stats, stats_summed): + # Note that learned_lexicon and ref_lexicon are dicts of sets of prons, while the other two lexicons are sets of (word, pron) pairs. + threshold = 2 + words = [defaultdict(set) for i in range(4)] # "words" contains four bins, where we + # classify each word into, according to whether it's count > threshold, + # and whether it's OOVs w.r.t the reference lexicon. + + src = {} + print("# Note: This file contains pronunciation info for words who have candidate " + "prons from G2P/phonetic-decoding accepted in the learned lexicon" + ", sorted by their counts in acoustic training data, " + ,file=args.ref_lexicon_edits_handle) + print("# 1st Col: source of the candidate pron: G(2P) / P(hone-decoding) / R(eference)." + ,file=args.ref_lexicon_edits_handle) + print("# 2nd Col: accepted or not in the learned lexicon (Y/N).", file=args.ref_lexicon_edits_handle) + print("# 3rd Col: soft counts from lattice-alignment (not augmented by prior-counts)." + ,file=args.ref_lexicon_edits_handle) + print("# 4th Col: the pronunciation cadidate.", file=args.ref_lexicon_edits_handle) + + # words which are to be printed into the edits file. + words_to_edit = [] + num_prons_tot = 0 + for word in learned_lexicon: + num_prons_tot += len(learned_lexicon[word]) + count = len(stats[word]) # This count could be smaller than the count read from the dict "counts", + # since in each sub-utterance, multiple occurences (which is rare) of the same word are compressed into one. 
+ # We use this count here so that in the edit-file, soft counts for each word sum up to one. + flags = ['0' for i in range(3)] # "flags" contains three binary indicators, + # indicating where this word's pronunciations come from. + for pron in learned_lexicon[word]: + if word in pd_lexicon and pron in pd_lexicon[word]: + flags[0] = '1' + src[(word, pron)] = 'P' + elif word in ref_lexicon and pron in ref_lexicon[word]: + flags[1] = '1' + src[(word, pron)] = 'R' + elif word in g2p_lexicon and pron in g2p_lexicon[word]: + flags[2] = '1' + src[(word, pron)] = 'G' + if word in ref_lexicon: + all_ref_prons_accepted = True + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + all_ref_prons_accepted = False + break + if not all_ref_prons_accepted or flags[0] == '1' or flags[2] == '1': + words_to_edit.append((word, len(stats[word]))) + if count > threshold: + words[0][flags[0] + flags[1] + flags[2]].add(word) + else: + words[1][flags[0] + flags[1] + flags[2]].add(word) + else: + if count > threshold: + words[2][flags[0] + flags[2]].add(word) + else: + words[3][flags[0] + flags[2]].add(word) + + words_to_edit_sorted = sorted(words_to_edit, key=lambda entry: entry[1], reverse=True) + for word, count in words_to_edit_sorted: + print("------------",word, "%2.1f" % count, "--------------", file=args.ref_lexicon_edits_handle) + learned_prons = [] + for pron in learned_lexicon[word]: + learned_prons.append((src[(word, pron)], 'Y', stats_summed[(word, pron)], pron)) + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + learned_prons.append(('R', 'N', stats_summed[(word, pron)], pron)) + learned_prons_sorted = sorted(learned_prons, key=lambda item: item[2], reverse=True) + for item in learned_prons_sorted: + print('{} | {} | {:.2f} | {}'.format(item[0], item[1], item[2], item[3]), file=args.ref_lexicon_edits_handle) + + num_oovs_with_acoustic_evidence = len(set(learned_lexicon.keys()).difference(set(ref_lexicon.keys()))) + num_oovs = 
len(set(counts.keys()).difference(set(ref_lexicon.keys()))) + num_ivs = len(learned_lexicon) - num_oovs_with_acoustic_evidence + print("Average num. prons per word in the learned lexicon is {}".format(float(num_prons_tot)/float(len(learned_lexicon))), file=sys.stderr) + # print("Here are the words whose reference pron candidates were all declined", words[0]['100'], file=sys.stderr) + print("-------------------------------------------------Summary------------------------------------------", file=sys.stderr) + print("We have acoustic evidence for {} out of {} in-vocab (w.r.t the reference lexicon) words from the acoustic training data.".format(num_ivs, len(ref_lexicon)), file=sys.stderr) + print(" Among those frequent words whose counts in the training text > ", threshold, ":", file=sys.stderr) + num_freq_ivs_from_all_sources = len(words[0]['111']) + len(words[0]['110']) + len(words[0]['011']) + num_freq_ivs_from_g2p_or_phonetic_decoding = len(words[0]['101']) + len(words[0]['001']) + len(words[0]['100']) + num_freq_ivs_from_ref = len(words[0]['010']) + num_infreq_ivs_from_all_sources = len(words[1]['111']) + len(words[1]['110']) + len(words[1]['011']) + num_infreq_ivs_from_g2p_or_phonetic_decoding = len(words[1]['101']) + len(words[1]['001']) + len(words[1]['100']) + num_infreq_ivs_from_ref = len(words[1]['010']) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_freq_ivs_from_all_sources), file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_freq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_freq_ivs_from_ref), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_infreq_ivs_from_all_sources), 
file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_infreq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_infreq_ivs_from_ref), file=sys.stderr) + print("---------------------------------------------------------------------------------------------------", file=sys.stderr) + num_freq_oovs_from_both_sources = len(words[2]['11']) + num_freq_oovs_from_phonetic_decoding = len(words[2]['10']) + num_freq_oovs_from_g2p = len(words[2]['01']) + num_infreq_oovs_from_both_sources = len(words[3]['11']) + num_infreq_oovs_from_phonetic_decoding = len(words[3]['10']) + num_infreq_oovs_from_g2p = len(words[3]['01']) + print('We have acoustic evidence for {} out of {} OOV (w.r.t the reference lexicon) words from the acoustic training data.'.format(num_oovs_with_acoustic_evidence, num_oovs), file=sys.stderr) + print(' Among those words whose counts in the training text > {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_freq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_freq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_freq_oovs_from_g2p), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_infreq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_infreq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_infreq_oovs_from_g2p), file=sys.stderr) + +def WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, file_handle): + for word, prons 
in learned_lexicon.iteritems(): + if word not in ref_lexicon: + for pron in prons: + print('{0} {1}'.format(word, pron), file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + + # Read in three lexicon sources, word counts, and pron stats. + counts = ReadWordCounts(args.word_counts_file_handle) + ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts) + g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts) + pd_lexicon = ReadLexicon(args, args.pd_lexicon_handle, counts) + stats, stats_summed = ReadArcStats(args.arc_stats_file_handle) + learned_lexicon = ReadLexicon(args, args.learned_lexicon_handle, counts) + + # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov. + WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, args.learned_lexicon_oov_handle) + # Edits will be printed into ref_lexicon_edits, and the summary will be printed into stderr. + WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, pd_lexicon, g2p_lexicon, counts, stats, stats_summed) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/prons_to_lexicon.py b/egs/wsj/s5/steps/dict/prons_to_lexicon.py index 2a87d172602..a957b02d3d3 100755 --- a/egs/wsj/s5/steps/dict/prons_to_lexicon.py +++ b/egs/wsj/s5/steps/dict/prons_to_lexicon.py @@ -6,6 +6,7 @@ # we're using python 3.x style print but want it to work in python 2.x, from __future__ import print_function +from collections import defaultdict import argparse import sys @@ -21,8 +22,8 @@ def __call__(self, parser, namespace, values, option_string=None): raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) def GetArgs(): - parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phone level decoding) " - "into a lexicon for lexicon learning. We prune the pronunciations " + parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phonetic decoding or g2p) " + "into a lexicon for. 
We prune the pronunciations " "based on a provided stats file, and optionally filter out entries which are present " "in a filter lexicon.", epilog = "e.g. steps/dict/prons_to_lexicon.py --min-prob=0.4 \\" @@ -39,6 +40,8 @@ def GetArgs(): action = StrToBoolAction, choices = ["true", "false"], help = "If normalize lexicon such that the max " "probability is 1.") + parser.add_argument("--top-N", type = int, default = 0, + help = "If non-zero, we just take the top N pronunciations (according to stats/pron-probs) for each word.") parser.add_argument("--min-prob", type = float, default = 0.1, help = "Remove pronunciation with probabilities less " "than this value after normalization.") @@ -46,8 +49,7 @@ def GetArgs(): help = "Exclude entries in this filter lexicon from the output lexicon." "each line must be ") parser.add_argument("stats_file", metavar='', type = str, - help = "Input file containing pronunciation statistics, representing how many times " - "each word-pronunciation appear in the phonetic decoding results." + help = "Input lexicon file containing pronunciation statistics/probs in the first column." 
"each line must be ") parser.add_argument("out_lexicon", metavar='', type = str, help = "Output lexicon.") @@ -150,6 +152,18 @@ def NormalizeLexicon(lexicon, set_max_to_one = True, prob = 0 lexicon[entry] = prob +def TakeTopN(lexicon, top_N): + lexicon_reshaped = defaultdict(list) + lexicon_pruned = {} + for entry, prob in lexicon.iteritems(): + lexicon_reshaped[entry[0]].append([entry[1], prob]) + for word in lexicon_reshaped: + prons = lexicon_reshaped[word] + sorted_prons = sorted(prons, reverse=True, key=lambda prons: prons[1]) + for i in range(len(sorted_prons)): + if i >= top_N: + lexicon[(word, sorted_prons[i][0])] = 0 + def WriteLexicon(args, lexicon, filter_lexicon): words = set() num_removed = 0 @@ -179,10 +193,15 @@ def Main(): word_probs = ConvertWordCountsToProbs(args, lexicon, word_count) lexicon = ConvertWordProbsToLexicon(word_probs) - filter_lexicon = ReadLexicon(args.filter_lexicon_handle) - NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one, - set_sum_to_one = args.set_sum_to_one, - min_prob = args.min_prob) + filter_lexicon = set() + if args.filter_lexicon is not '': + filter_lexicon = ReadLexicon(args.filter_lexicon_handle) + if args.top_N > 0: + TakeTopN(lexicon, args.top_N) + else: + NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one, + set_sum_to_one = args.set_sum_to_one, + min_prob = args.min_prob) WriteLexicon(args, lexicon, filter_lexicon) args.out_lexicon_handle.close() diff --git a/egs/wsj/s5/steps/dict/select_prons_greedy.py b/egs/wsj/s5/steps/dict/select_prons_greedy.py new file mode 100755 index 00000000000..cf71070e134 --- /dev/null +++ b/egs/wsj/s5/steps/dict/select_prons_greedy.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. 
+
+from __future__ import print_function
+from collections import defaultdict
+import argparse
+import sys
+import math
+
+def GetArgs():
+    parser = argparse.ArgumentParser(
+        description = "Use a greedy framework to select pronunciation candidates"
+        "from three sources: a reference lexicon, G2P lexicon and phonetic-decoding"
+        "(PD) lexicon. Basically, this script implements the Alg. 1 in the paper:"
+        "Acoustic data-driven lexicon learning based on a greedy pronunciation "
+        "selection framework, by X. Zhang, V. Mahonar, D. Povey and S. Khudanpur,"
+        "Interspeech 2017. The inputs are an arc-stats file, containing "
+        "acoustic evidence (tau_{uwb} in the paper) and three source lexicons "
+        "(phonetic-decoding(PD)/G2P/ref). The outputs is the learned lexicon for"
+        "all words in the arc_stats (acoustic evidence) file.",
+        epilog = "See steps/dict/learn_lexicon_greedy.sh for example.")
+    parser.add_argument("--alpha", type = str, default = "0,0,0",
+                        help = "Scaling factors for the likelihood reduction threshold."
+                        "of three pronunciaiton candidate sources: phonetic-decoding (PD),"
+                        "G2P and reference. The valid range of each dimension is [0, 1], and"
+                        "a large value means we prune pronunciations from this source more"
+                        "aggressively. Setting a dimension to zero means we never want to remove"
+                        "pronunciaiton from that source. See Section 4.3 in the paper for details.")
+    parser.add_argument("--beta", type = str, default = "0,0,0",
+                        help = "smoothing factors for the likelihood reduction term."
+                        "of three pronunciaiton candidate sources: phonetic-decoding (PD),"
+                        "G2P and reference. The valid range of each dimension is [0, 100], and"
+                        "a large value means we prune pronunciations from this source more"
+                        "aggressively. See Section 4.3 in the paper for details.")
+    parser.add_argument("--delta", type = float, default = 0.000000001,
+                        help = "Floor value of the pronunciation posterior statistics."
+                        "The valid range is (0, 0.01),"
+                        "See Section 3 in the paper for details.")
+    parser.add_argument("silence_phones_file", metavar = "", type = str,
+                        help = "File containing a list of silence phones.")
+    parser.add_argument("arc_stats_file", metavar = "", type = str,
+                        help = "File containing word-pronunciation statistics obtained from lattices; "
+                        "each line must be ")
+    parser.add_argument("word_counts_file", metavar = "", type = str,
+                        help = "File containing word counts in acoustic training data; "
+                        "each line must be .")
+    parser.add_argument("ref_lexicon", metavar = "", type = str,
+                        help = "The reference lexicon (most probably hand-derived)."
+                        "Each line must be ")
+    parser.add_argument("g2p_lexicon", metavar = "", type = str,
+                        help = "Candidate ronouciations from G2P results."
+                        "Each line must be ")
+    parser.add_argument("pd_lexicon", metavar = "", type = str,
+                        help = "Candidate ronouciations from phonetic decoding results."
+                        "Each line must be ")
+    parser.add_argument("learned_lexicon", metavar = "", type = str,
+                        help = "Learned lexicon.")
+
+
+    print (' '.join(sys.argv), file=sys.stderr)
+
+    args = parser.parse_args()
+    args = CheckArgs(args)
+
+    return args
+
+def CheckArgs(args):
+    # Open all input/output file handles and validate the numeric options.
+    args.silence_phones_file_handle = open(args.silence_phones_file)
+    if args.arc_stats_file == "-":
+        args.arc_stats_file_handle = sys.stdin
+    else:
+        args.arc_stats_file_handle = open(args.arc_stats_file)
+    args.word_counts_file_handle = open(args.word_counts_file)
+    args.ref_lexicon_handle = open(args.ref_lexicon)
+    args.g2p_lexicon_handle = open(args.g2p_lexicon)
+    args.pd_lexicon_handle = open(args.pd_lexicon)
+    args.learned_lexicon_handle = open(args.learned_lexicon, "w")
+
+    alpha = args.alpha.strip().split(',')
+    if len(alpha) != 3:
+        raise Exception('Invalid alpha ', args.alpha)
+    for i in range(0,3):
+        if float(alpha[i]) < 0 or float(alpha[i]) > 1:
+            raise Exception('alpha ', alpha[i],
+                            ' is invalid, it must be within [0, 1].')
+        if float(alpha[i]) == 0:
+            alpha[i] = -1e-3
+    # The absolute likelihood loss (search for loss_abs) is supposed to be positive.
+    # But it could be negative near zero because of numerical precision limit.
+    # In this case, even if alpha is set to be zero, which means we never want to
+    # remove pronunciation from that source, the quality score (search for q_b)
+    # could still be negative, which means this pron could be potentially removed.
+    # To prevent this, we set alpha as a negative value near zero to ensure
+    # q_b is always positive.
+
+    args.alpha = [float(alpha[0]), float(alpha[1]), float(alpha[2])]
+    print("[alpha_{pd}, alpha_{g2p}, alpha_{ref}] is: ", args.alpha)
+
+    beta = args.beta.strip().split(',')
+    if len(beta) != 3:
+        raise Exception('Invalid beta ', args.beta)
+    for i in range(0,3):
+        if float(beta[i]) < 0 or float(beta[i]) > 100:
+            raise Exception('beta ', beta[i],
+                            ' is invalid, it must be within [0, 100].')
+    args.beta = [float(beta[0]), float(beta[1]), float(beta[2])]
+    print("[beta_{pd}, beta_{g2p}, beta_{ref}] is: ", args.beta)
+
+    if args.delta <= 0 or args.delta > 0.1:
+        raise Exception('delta ', args.delta, ' is invalid, it must be within'
+                        '(0, 0.01).')
+    print("delta is: ", args.delta)
+
+    return args
+
+def ReadArcStats(arc_stats_file_handle):
+    # Accumulate per-(word, utterance, start-frame) soft counts of each pron,
+    # plus per-(word, pron) totals summed over all occurrences.
+    stats = defaultdict(lambda : defaultdict(dict))
+    stats_summed = defaultdict(float)
+    for line in arc_stats_file_handle.readlines():
+        splits = line.strip().split()
+
+        if (len(splits) == 0):
+            continue
+
+        if (len(splits) < 5):
+            raise Exception('Invalid format of line ' + line
+                            + ' in arc stats file.')
+        utt = splits[1]
+        start_frame = int(splits[2])
+        word = splits[0]
+        count = float(splits[3])
+        phones = splits[4:]
+        phones = ' '.join(phones)
+        stats[word][(utt, start_frame)][phones] = count
+        stats_summed[(word, phones)] += count
+    return stats, stats_summed
+
+def ReadWordCounts(word_counts_file_handle):
+    # Map each word to its count in the acoustic training data.
+    counts = {}
+    for line in word_counts_file_handle.readlines():
+        splits = line.strip().split()
+        if len(splits) < 2:
+            raise Exception('Invalid format of line ' + line
+                            + ' in counts file.')
+        word = splits[0]
+        count = int(splits[1])
+        counts[word] = count
+    return counts
+
+def ReadLexicon(args, lexicon_file_handle, counts):
+    # we're skipping any word not in counts (not seen in training data),
+    # cause we're only learning prons for words who have acoustic examples.
+    lexicon = defaultdict(set)
+    for line in lexicon_file_handle.readlines():
+        splits = line.strip().split()
+        if len(splits) == 0:
+            continue
+        if len(splits) < 2:
+            raise Exception('Invalid format of line ' + line
+                            + ' in lexicon file.')
+        word = splits[0]
+        if word not in counts:
+            continue
+        phones = ' '.join(splits[1:])
+        lexicon[word].add(phones)
+    return lexicon
+
+def FilterPhoneticDecodingLexicon(args, pd_lexicon):
+    # We want to remove all candidates which contain silence phones
+    silphones = set()
+    for line in args.silence_phones_file_handle:
+        silphones.add(line.strip())
+    rejected_candidates = set()
+    for word, prons in pd_lexicon.items():
+        for pron in prons:
+            for phone in pron.split():
+                if phone in silphones:
+                    rejected_candidates.add((word, pron))
+                    break
+    for word, pron in rejected_candidates:
+        pd_lexicon[word].remove(pron)
+    return pd_lexicon
+
+# One iteration of Expectation-Maximization computation (Eq. 3-4 in the paper).
+def OneEMIter(args, word, stats, prons, pron_probs, debug=False):
+    # Returns (updated pron_probs, avg. per-example log-likelihood) for one EM step.
+    prob_acc = [0.0 for i in range(len(prons[word]))]
+    s = sum(pron_probs)
+    for i in range(len(pron_probs)):
+        pron_probs[i] = pron_probs[i] / s
+    log_like = 0.0
+    for (utt, start_frame) in stats[word]:
+        prob = []
+        soft_counts = []
+        for i in range(len(prons[word])):
+            phones = prons[word][i]
+            soft_count = stats[word][(utt, start_frame)].get(phones, 0)
+            if soft_count < args.delta:
+                soft_count = args.delta
+            soft_counts.append(soft_count)
+        prob = [i[0] * i[1] for i in zip(soft_counts, pron_probs)]
+        for i in range(len(prons[word])):
+            prob_acc[i] += prob[i] / sum(prob)
+        log_like += math.log(sum(prob))
+    pron_probs = [1.0 / float(len(stats[word])) * p for p in prob_acc]
+    log_like = 1.0 / float(len(stats[word])) * log_like
+    if debug:
+        print("Log_like of the word: ", log_like, "pron probs: ", pron_probs)
+    return pron_probs, log_like
+
+def SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon, dianostic_info=False):
+    prons = defaultdict(list) # Put all possible prons from three source lexicons into this dictionary
+    src = {} # Source of each (word, pron) pair: 'P' = phonetic-decoding, 'G' = G2P, 'R' = reference
+    learned_lexicon = defaultdict(set) # Put all selected prons in this dictionary
+    for lexicon in ref_lexicon, g2p_lexicon, pd_lexicon:
+        for word in lexicon:
+            for pron in lexicon[word]:
+                prons[word].append(pron)
+    for word in prons:
+        for pron in prons[word]:
+            if word in pd_lexicon and pron in pd_lexicon[word]:
+                src[(word, pron)] = 'P'
+            if word in g2p_lexicon and pron in g2p_lexicon[word]:
+                src[(word, pron)] = 'G'
+            if word in ref_lexicon and pron in ref_lexicon[word]:
+                src[(word, pron)] = 'R'
+
+    for word in prons:
+        if word not in stats:
+            continue
+        n = len(prons[word])
+        pron_probs = [1/float(n) for i in range(n)]
+        if dianostic_info:
+            print("pronunciations of word '{}': {}".format(word, prons[word]))
+        active_indexes = set(range(len(prons[word])))
+
+        deleted_prons = [] # indexes of prons to be deleted
+        soft_counts_normalized = []
+        while len(active_indexes) > 1:
+            log_like = 1.0
+            log_like_last = -1.0
+            num_iters = 0
+            while abs(log_like - log_like_last) > 1e-7:
+                num_iters += 1
+                log_like_last = log_like
+                pron_probs, log_like = OneEMIter(args, word, stats, prons, pron_probs, False)
+                if log_like_last == 1.0 and len(soft_counts_normalized) == 0: # the first iteration
+                    soft_counts_normalized = pron_probs
+                    if dianostic_info:
+                        print("Avg.(over all egs) soft counts: {}".format(soft_counts_normalized))
+            if dianostic_info:
+                print("\n Log_like after {} iters of EM: {}, estimated pron_probs: {} \n".format(
+                    num_iters, log_like, pron_probs))
+            candidates_to_delete = []
+
+            for i in active_indexes:
+                pron_probs_mod = [p for p in pron_probs]
+                pron_probs_mod[i] = 0.0
+                for j in range(len(pron_probs_mod)):
+                    if j in active_indexes and j != i:
+                        pron_probs_mod[j] += 0.01
+                pron_probs_mod = [s / sum(pron_probs_mod) for s in pron_probs_mod]
+                log_like2 = 1.0
+                log_like2_last = -1.0
+                num_iters2 = 0
+                # Running EM until convergence
+                while abs(log_like2 - log_like2_last) > 0.001 :
+                    num_iters2 += 1
+                    log_like2_last = log_like2
+                    pron_probs_mod, log_like2 = OneEMIter(args, word, stats,
+                                                          prons, pron_probs_mod, False)
+
+                loss_abs = log_like - log_like2 # absolute likelihood loss before normalization
+                # (supposed to be positive, but could be negative near zero because of numerical precision limit).
+                log_delta = math.log(args.delta)
+                thr = -log_delta
+                loss = loss_abs
+                source = src[(word, prons[word][i])]
+                if dianostic_info:
+                    print("\n set the pron_prob of '{}' whose source is {}, to zero results in {}"
+                          " loss in avg. log-likelihood; Num. iters until converging:{}. ".format(
+                        prons[word][i], source, loss, num_iters2))
+                # Compute quality score q_b = loss_abs * M_w / (M_w + beta_s(b)) + alpha_s(b) * log_delta
+                # See Sec. 4.3 and Alg. 1 in the paper.
+                if source == 'P':
+                    thr *= args.alpha[0]
+                    loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[0])
+                if source == 'G':
+                    thr *= args.alpha[1]
+                    loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[1])
+                if source == 'R':
+                    thr *= args.alpha[2]
+                    loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[2])
+                if loss - thr < 0: # loss - thr here is just q_b
+                    if dianostic_info:
+                        print("Smoothed log-like loss {} is smaller than threshold {} so that the quality"
+                              "score {} is negative, adding the pron to the list of candidates to delete"
+                              ". ".format(loss, thr, loss-thr))
+                    candidates_to_delete.append((loss-thr, i))
+            if len(candidates_to_delete) == 0:
+                break
+            candidates_to_delete_sorted = sorted(candidates_to_delete,
+                key=lambda candidates_to_delete: candidates_to_delete[0])
+
+            deleted_candidate = candidates_to_delete_sorted[0]
+            active_indexes.remove(deleted_candidate[1])
+            pron_probs[deleted_candidate[1]] = 0.0
+            for i in range(len(pron_probs)):
+                if i in active_indexes:
+                    pron_probs[i] += 0.01
+            pron_probs = [s / sum(pron_probs) for s in pron_probs]
+            source = src[(word, prons[word][deleted_candidate[1]])]
+            pron = prons[word][deleted_candidate[1]]
+            soft_count = soft_counts_normalized[deleted_candidate[1]]
+            quality_score = deleted_candidate[0]
+            # This part of diagnostic info provides hints to the user on how to adjust the parameters.
+            if dianostic_info:
+                print("removed pron {}, from source {} with quality score {:.5f}".format(
+                    pron, source, quality_score))
+            if (source == 'P' and soft_count > 0.7 and len(stats[word]) > 5):
+                print("WARNING: alpha_{{pd}} or beta_{{pd}} may be too large!"
+                      " For the word '{}' whose count is {}, the candidate "
+                      " pronunciation from phonetic decoding '{}' with normalized "
+                      " soft count {} (out of 1) is rejected. It shouldn't have been"
+                      " rejected if alpha_{{pd}} is smaller than {}".format(
+                          word, len(stats[word]), pron, soft_count,
+                          -loss / log_delta),
+                      file=sys.stderr)
+                if loss_abs > thr:
+                    print(" or beta_{{pd}} is smaller than {}".format(
+                        (loss_abs / thr - 1) * len(stats[word])), file=sys.stderr)
+            if (source == 'G' and soft_count > 0.7 and len(stats[word]) > 5):
+                print("WARNING: alpha_{{g2p}} or beta_{{g2p}} may be too large!"
+                      " For the word '{}' whose count is {}, the candidate "
+                      " pronunciation from G2P '{}' with normalized "
+                      " soft count {} (out of 1) is rejected. It shouldn't have been"
+                      " rejected if alpha_{{g2p}} is smaller than {} ".format(
+                          word, len(stats[word]), pron, soft_count,
+                          -loss / log_delta),
+                      file=sys.stderr)
+                if loss_abs > thr:
+                    print(" or beta_{{g2p}} is smaller than {}.".format((
+                        loss_abs / thr - 1) * len(stats[word])), file=sys.stderr)
+            deleted_prons.append(deleted_candidate[1])
+        for i in range(len(prons[word])):
+            if i not in deleted_prons:
+                learned_lexicon[word].add(prons[word][i])
+
+    return learned_lexicon
+
+def WriteLearnedLexicon(learned_lexicon, file_handle):
+    # Print each selected (word, pron) pair and close the output handle.
+    for word, prons in learned_lexicon.items():
+        for pron in prons:
+            print('{0} {1}'.format(word, pron), file=file_handle)
+    file_handle.close()
+
+def Main():
+    args = GetArgs()
+
+    # Read in three lexicon sources, word counts, and pron stats.
+    counts = ReadWordCounts(args.word_counts_file_handle)
+    ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts)
+    g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts)
+    pd_lexicon = ReadLexicon(args, args.pd_lexicon_handle, counts)
+    stats, stats_summed = ReadArcStats(args.arc_stats_file_handle)
+    pd_lexicon = FilterPhoneticDecodingLexicon(args, pd_lexicon)
+
+    # Select prons to construct the learned lexicon.
+ learned_lexicon = SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon) + + # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov. + WriteLearnedLexicon(learned_lexicon, args.learned_lexicon_handle) + +if __name__ == "__main__": + Main() From ae555cf795004402b540ad876b74a2eb09459358 Mon Sep 17 00:00:00 2001 From: xiaohui-zhang Date: Thu, 25 Oct 2018 11:14:15 -0400 Subject: [PATCH 2/4] added memory compression option for wsj tdnn recipe --- egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh index 1724c057e12..526059b7b90 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh @@ -220,6 +220,7 @@ if [ $stage -le 16 ]; then --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=2000" \ --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=10 \ From 7a74551eb26a5cde5cc5586a92833be8a59d0087 Mon Sep 17 00:00:00 2001 From: xiaohui-zhang Date: Thu, 25 Oct 2018 11:29:37 -0400 Subject: [PATCH 3/4] rename the old lexicon learning recipe steps/dict/learn_lexicon.sh to steps/dict/learn_lexicon_bayesian.sh --- .../local/{run_learn_lex.sh => run_learn_lex_bayesian.sh} | 4 ++-- egs/wsj/s5/steps/dict/apply_lexicon_edits.py | 2 +- egs/wsj/s5/steps/dict/get_pron_stats.py | 2 +- egs/wsj/s5/steps/dict/internal/sum_arc_info.py | 2 +- .../dict/{learn_lexicon.sh => learn_lexicon_bayesian.sh} | 0 egs/wsj/s5/steps/dict/prons_to_lexicon.py | 2 +- egs/wsj/s5/steps/dict/prune_pron_candidates.py | 2 +- egs/wsj/s5/steps/dict/select_prons_bayesian.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) rename egs/tedlium/s5_r2/local/{run_learn_lex.sh => run_learn_lex_bayesian.sh} (98%) rename 
egs/wsj/s5/steps/dict/{learn_lexicon.sh => learn_lexicon_bayesian.sh} (100%) diff --git a/egs/tedlium/s5_r2/local/run_learn_lex.sh b/egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh similarity index 98% rename from egs/tedlium/s5_r2/local/run_learn_lex.sh rename to egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh index a2a6f2e46b8..f1497bfe202 100755 --- a/egs/tedlium/s5_r2/local/run_learn_lex.sh +++ b/egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh @@ -2,7 +2,7 @@ # # This script demonstrates a lexicon learning recipe, which aims to imrove # the pronounciation of abbreviated words in the TED-LIUM lexicon. It assumes -# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon.sh +# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_bayesian.sh # for explanation of the options. # # Copyright 2016 Xiaohui Zhang @@ -78,7 +78,7 @@ fi # Learn a lexicon based on the acoustic training data and the reference lexicon. if [ $stage -le 1 ]; then - steps/dict/learn_lexicon.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \ + steps/dict/learn_lexicon_bayesian.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \ --min-prob $min_prob --variants-prob-mass $variants_prob_mass \ --variants-prob-mass-ref $variants_prob_mass_ref \ --prior-counts-tot $prior_counts_tot --prior-mean $prior_mean \ diff --git a/egs/wsj/s5/steps/dict/apply_lexicon_edits.py b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py index a5bdbc30d46..f8568971fb7 100755 --- a/egs/wsj/s5/steps/dict/apply_lexicon_edits.py +++ b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py @@ -10,7 +10,7 @@ def GetArgs(): parser = argparse.ArgumentParser(description = "Apply an lexicon edits file (output from steps/dict/select_prons_bayesian.py)to an input lexicon" "to produce a learned lexicon.", - epilog = "See steps/dict/learn_lexicon.sh for example") + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") parser.add_argument("in_lexicon", metavar='', type = str, help = "Input lexicon. 
Each line must be .") diff --git a/egs/wsj/s5/steps/dict/get_pron_stats.py b/egs/wsj/s5/steps/dict/get_pron_stats.py index 41866294723..f6ce8e49807 100755 --- a/egs/wsj/s5/steps/dict/get_pron_stats.py +++ b/egs/wsj/s5/steps/dict/get_pron_stats.py @@ -19,7 +19,7 @@ def GetArgs(): epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" " exp/tri3_lex_0.4_work/lats/pron_stats.txt" - "See steps/dict/learn_lexicon.sh for examples in detail.") + "See steps/dict/learn_lexicon_greedy.sh for examples in detail.") parser.add_argument("arc_info_file", metavar = "", type = str, help = "Input file containing per arc statistics; " diff --git a/egs/wsj/s5/steps/dict/internal/sum_arc_info.py b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py index d3913ec954f..5f02bc5fc29 100755 --- a/egs/wsj/s5/steps/dict/internal/sum_arc_info.py +++ b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py @@ -24,7 +24,7 @@ def GetArgs(): parser = argparse.ArgumentParser( description = "Accumulate statistics from per arc lattice statitics" "for lexicon learning", - epilog = "See steps/dict/learn_lexicon.sh for example") + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") parser.add_argument("--set-sum-to-one", type = str, default = True, action = StrToBoolAction, choices = ["true", "false"], diff --git a/egs/wsj/s5/steps/dict/learn_lexicon.sh b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh similarity index 100% rename from egs/wsj/s5/steps/dict/learn_lexicon.sh rename to egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh diff --git a/egs/wsj/s5/steps/dict/prons_to_lexicon.py b/egs/wsj/s5/steps/dict/prons_to_lexicon.py index a957b02d3d3..37d7810411b 100755 --- a/egs/wsj/s5/steps/dict/prons_to_lexicon.py +++ b/egs/wsj/s5/steps/dict/prons_to_lexicon.py @@ -30,7 +30,7 @@ def GetArgs(): "--filter-lexicon=exp/tri3_lex_0.4_work/phone_decode/filter_lexicon.txt \\" "exp/tri3_lex_0.4_work/phone_decode/prons.txt 
\\" "exp/tri3_lex_0.4_work/lexicon_phone_decoding.txt" - "See steps/dict/learn_lexicon.sh for examples in detail.") + "See steps/dict/learn_lexicon_greedy.sh for examples in detail.") parser.add_argument("--set-sum-to-one", type = str, default = False, action = StrToBoolAction, choices = ["true", "false"], diff --git a/egs/wsj/s5/steps/dict/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/prune_pron_candidates.py index affc5b17705..0f64f38b785 100755 --- a/egs/wsj/s5/steps/dict/prune_pron_candidates.py +++ b/egs/wsj/s5/steps/dict/prune_pron_candidates.py @@ -16,7 +16,7 @@ def GetArgs(): "(For words in the reference lexicon, N = # pron variants given by the reference" "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." "r is a user-specified constant, like 2.", - epilog = "See steps/dict/learn_lexicon.sh for example") + epilog = "See steps/dict/learn_lexicon_geedy.sh for example") parser.add_argument("--r", type = float, default = "2.0", help = "a user-specified ratio parameter which determines how many" diff --git a/egs/wsj/s5/steps/dict/select_prons_bayesian.py b/egs/wsj/s5/steps/dict/select_prons_bayesian.py index e728a4af0b8..4ccca302ebf 100755 --- a/egs/wsj/s5/steps/dict/select_prons_bayesian.py +++ b/egs/wsj/s5/steps/dict/select_prons_bayesian.py @@ -23,7 +23,7 @@ def GetArgs(): "a learned lexicon for words out of the ref. vocab (learned_lexicon_oov)," "and a lexicon_edits file containing suggested modifications of prons, for" "words within the ref. vocab (ref_lexicon_edits).", - epilog = "See steps/dict/learn_lexicon.sh for example.") + epilog = "See steps/dict/learn_lexicon_bayesian.sh for example.") parser.add_argument("--prior-mean", type = str, default = "0,0,0", help = "Mean of priors (summing up to 1) assigned to three exclusive n" "pronunciatio sources: reference lexicon, g2p, and phonetic decoding. 
We " From b147dc87f7665099f87fef4f03a3a3dd0d282893 Mon Sep 17 00:00:00 2001 From: xiaohui-zhang Date: Fri, 14 Dec 2018 01:25:39 -0500 Subject: [PATCH 4/4] fixed typos --- egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh | 2 +- egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh | 2 +- egs/wsj/s5/steps/dict/prune_pron_candidates.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh index ae9681ebab3..042f8f94da4 100755 --- a/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh +++ b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh @@ -28,7 +28,7 @@ # Begin configuration section. -cmd=queue.pl +cmd=run.pl nj=4 stage=0 diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh index 83aa98c1700..56e85f20d62 100755 --- a/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh +++ b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh @@ -40,7 +40,7 @@ stage=0 # Begin configuration section. -cmd=queue.pl +cmd=run.pl nj= stage=0 oov_symbol= diff --git a/egs/wsj/s5/steps/dict/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/prune_pron_candidates.py index 0f64f38b785..e32478cecea 100755 --- a/egs/wsj/s5/steps/dict/prune_pron_candidates.py +++ b/egs/wsj/s5/steps/dict/prune_pron_candidates.py @@ -16,7 +16,7 @@ def GetArgs(): "(For words in the reference lexicon, N = # pron variants given by the reference" "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." "r is a user-specified constant, like 2.", - epilog = "See steps/dict/learn_lexicon_geedy.sh for example") + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") parser.add_argument("--r", type = float, default = "2.0", help = "a user-specified ratio parameter which determines how many"