#!/bin/bash
#
# This script demonstrates a lexicon learning recipe, which aims to improve
# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_greedy.sh
# for explanation of the options.
#
# Copyright 2018 Xiaohui Zhang
# Apache 2.0

. ./cmd.sh
. ./path.sh

# NOTE(review): the original token here was stripped by angle-bracket mangling;
# "<unk>" is the conventional TED-LIUM OOV symbol -- confirm against data/lang/oov.txt.
oov_symbol="<unk>"
# The user may have a phonetisaurus-trained English g2p model ready.
g2p_mdl_dir=
# The dir which contains the reference lexicon (most probably hand-derived)
# we want to expand/improve, and nonsilence_phones.txt, etc. which we need
# for building new dict dirs.
ref_dict=data/local/dict
# Acoustic training data we use to get alternative pronunciations and
# collect acoustic evidence.
data=data/train
# The cut-off parameter used to select pronunciation candidates from phone
# decoding. We remove pronunciations with probabilities less than this value
# after normalizing the probs s.t. the max-prob is 1.0 for each word.
min_prob=0.1
# Refer to steps/dict/select_prons_greedy.sh for the detailed meaning of
# alpha, beta and delta. Basically, the three dimensions of alpha and beta
# correspond to three pronunciation sources: phonetic-decoding, G2P and the
# reference lexicon, and the larger a value is, the more aggressively we'll
# prune pronunciations from that source. The valid range of each dim. is
# [0, 1] for alpha (0 means we never prune prons from that source) and
# [0, 100] for beta.
alpha="0.04,0.02,0"
beta="30,5,0"
# Floor value of the pronunciation posterior statistics.
delta=0.00000001
# This parameter determines how many pronunciations we keep for each word
# after the first pass pruning. See steps/dict/internal/prune_pron_candidates.py
# for details.
vcr=16

# Intermediate outputs of the lexicon learning stage will be put into $dir.
dir=exp/tri3_lex_greedy_work
nj=35
decode_nj=30
stage=0
lexlearn_stage=0
affix="learned_greedy"

. utils/parse_options.sh # accept options

# The reference vocab is the list of words for which we already have
# hand-derived pronunciations.
ref_vocab=data/local/vocab.txt
cat $ref_dict/lexicon.txt | awk '{print $1}' | sort | uniq > $ref_vocab || exit 1;

# Get a G2P-generated lexicon for OOV words (w.r.t the reference lexicon)
# in acoustic training data.
if [ $stage -le 0 ]; then
  # Quoted so the test is well-formed even when g2p_mdl_dir is empty.
  if [ -z "$g2p_mdl_dir" ]; then
    g2p_mdl_dir=exp/g2p_phonetisaurus
    steps/dict/train_g2p_phonetisaurus.sh $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1;
  fi
  awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \
    $data/text | sort -u > $data/train_vocab.txt || exit 1;
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \
    $data/train_vocab.txt | sort > $data/oov_train.txt || exit 1;
  # NOTE(review): G2P is applied to the full training vocab here rather than
  # to $data/oov_train.txt computed just above -- confirm this is intended.
  steps/dict/apply_g2p_phonetisaurus.sh --nbest 5 $data/train_vocab.txt $g2p_mdl_dir \
    exp/g2p_phonetisaurus/lex_train || exit 1;
fi

# Learn a lexicon based on the acoustic training data and the reference lexicon.
if [ $stage -le 1 ]; then
  steps/dict/learn_lexicon_greedy.sh --lexiconp-g2p "exp/g2p_phonetisaurus/lex_train/lexicon.lex" \
    --alpha $alpha --beta $beta --delta $delta \
    --min-prob $min_prob --cmd "$train_cmd" \
    --variant-counts-ratio $vcr \
    --stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl false \
    $ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_${affix}_nosp \
    $dir || exit 1;
fi

# Add pronunciation probs to the learned lexicon.
if [ $stage -le 2 ]; then
  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
    data/local/dict_${affix}_nosp $oov_symbol data/local/lang_${affix}_nosp data/lang_${affix}_nosp || exit 1;

  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
    $data data/lang_${affix}_nosp exp/tri2 exp/tri2_ali_${affix}_nosp || exit 1;

  steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_${affix}_nosp exp/tri2_ali_${affix}_nosp || exit 1;

  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_${affix}_nosp exp/tri2_ali_${affix}_nosp/pron_counts_nowb.txt \
    exp/tri2_ali_${affix}_nosp/sil_counts_nowb.txt \
    exp/tri2_ali_${affix}_nosp/pron_bigram_counts_nowb.txt data/local/dict_${affix} || exit 1;

  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
    data/local/dict_${affix} $oov_symbol data/local/lang_${affix} data/lang_${affix} || exit 1;
fi

# Re-decode with the learned lexicon.
if [ $stage -le 3 ]; then
  ! cmp data/lang_nosp/words.txt data/lang_${affix}/words.txt &&\
    echo "$0: The vocab of the affix lexicon and the reference vocab may be incompatible."
  cp data/lang_nosp/G.fst data/lang_${affix}/
  utils/mkgraph.sh data/lang_${affix} exp/tri3 exp/tri3/graph_${affix} || exit 1;

  for dset in dev test; do
    ( steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
        exp/tri3/graph_${affix} data/${dset} exp/tri3/decode_${affix}_${dset} || exit 1;
    ) &
  done
fi

# RESULTS:
# Baseline:
# %WER 18.7 | 507 17783 | 83.9 11.4 4.7 2.6 18.7 92.3 | -0.006 | exp/tri3/decode_dev/score_17_0.0/ctm.filt.filt.sys
# %WER 17.6 | 1155 27500 | 84.7 11.6 3.7 2.4 17.6 87.2 | 0.013 | exp/tri3/decode_test/score_15_0.0/ctm.filt.filt.sys

# Re-decoding with the learned lexicon:
# %WER 18.5 | 507 17783 | 84.3 11.2 4.5 2.8 18.5 92.3 | -0.007 | exp/tri3/decode_learned_greedy_dev/score_16_0.0/ctm.filt.filt.sys
# %WER 17.5 | 1155 27500 | 84.9 11.5 3.6 2.4 17.5 87.5 | 0.035 | exp/tri3/decode_learned_greedy_test/score_14_0.0/ctm.filt.filt.sys

# To see the effect to neural-net results, one should re-train NN with the learned lexicon.
# Experiments have shown that, with the new lang dir, one should just re-run NN training
# starting from the supervision generation (steps/align_fmllr_lats.sh) stage, and should
# expect improved overall WERs and word recognition performance on words whose pronunciations
# were changed.

# BUG FIX: the original script ran "exit" before "wait", so the backgrounded
# decoding jobs above were orphaned and never waited on. Wait first, then exit.
wait
exit 0;
The output contains accumulated soft-counts" - "of pronunciations", - epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" - " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" - " exp/tri3_lex_0.4_work/lats/pron_stats.txt" - "See steps/dict/learn_lexicon.sh for examples in detail.") + parser = argparse.ArgumentParser( + description = "Accumulate statistics from lattice-alignment outputs for lexicon" + "learning. The inputs are a file containing arc level information from lattice-align-words," + "and a map which maps word-position-dependent phones to word-position-independent phones" + "(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts" + "of pronunciations", + epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" + " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" + " exp/tri3_lex_0.4_work/lats/pron_stats.txt" + "See steps/dict/learn_lexicon.sh for examples in detail.") parser.add_argument("arc_info_file", metavar = "", type = str, help = "Input file containing per arc statistics; " diff --git a/egs/wsj/s5/steps/dict/internal/get_subsegments.py b/egs/wsj/s5/steps/dict/internal/get_subsegments.py new file mode 100755 index 00000000000..c431b4c7066 --- /dev/null +++ b/egs/wsj/s5/steps/dict/internal/get_subsegments.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. + +# we're using python 3.x style print but want it to work in python 2.x, +from __future__ import print_function +import argparse +import sys +import string + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "The purpose of this script is to use a ctm and a vocab file" + "to extract sub-utterances and a sub-segmentation. Extracted sub-utterances" + "are all the strings of consecutive in-vocab words from the ctm" + "surrounded by an out-of-vocab word at each end if present.", + epilog = "e.g. 
def CheckArgs(args):
    """Open file handles for the parsed arguments.

    "-" means stdin for the ctm (and vocab) inputs. Returns args augmented
    with *_handle attributes.
    """
    if args.ctm == "-":
        args.ctm_handle = sys.stdin
    else:
        args.ctm_handle = open(args.ctm)

    # BUG FIX: the original compared with "is not ''" (identity, not
    # equality) and mapped "-" to sys.stdout even though the vocab is read.
    if args.vocab != '':
        if args.vocab == "-":
            args.vocab_handle = sys.stdin
        else:
            args.vocab_handle = open(args.vocab)

    args.subsegment_handle = open(args.subsegment, 'w')
    args.text_handle = open(args.text, 'w')

    return args


def GetSubsegments(args, vocab):
    """Cut each ctm utterance into sub-utterances of consecutive in-vocab words.

    A sub-utterance is a maximal run of in-vocab words, extended by the
    flanking out-of-vocab word at each end when present; the oov token
    extends the time span but is never emitted as a word. For each
    sub-utterance one line '<sub-utt-id> <word> ...' is written to
    args.text_handle and one line '<sub-utt-id> <utt-id> <start> <end>'
    to args.subsegment_handle.

    Fixes w.r.t. the original:
      (1) the final in-vocab word of the last sub-utterance is no longer
          appended a second time by the end-of-ctm flush (it is already
          appended when the line is read);
      (2) the misspelled 'ent_time_last' is unified with the initialized
          'end_time_last', removing a dead initializer / latent NameError.
    """
    sub_utt = []            # words of the sub-utterance currently being built
    is_kept = False         # True if the current word extends the sub-utterance
    is_kept_last = False
    utt_id_last = None
    utt_id = None
    start_times = {}        # sub-utt-id -> start time
    end_times = {}          # sub-utt-id -> end time
    sub_utts = {}           # sub-utt-id -> (utt-id, word list)
    sub_utt_id = 1
    sub_utt_id_last = 1
    end_time_last = 0.0
    for line in args.ctm_handle:
        splits = line.strip().split()
        if len(splits) < 5:
            raise Exception("problematic line", line)

        utt_id = splits[0]
        start = float(splits[2])
        dur = float(splits[3])
        word = splits[4]
        if utt_id != utt_id_last:
            # New utterance: flush the unfinished sub-utterance of the
            # previous one (only if it has more than one word).
            sub_utt_id = 1
            if len(sub_utt) > 1:
                key_prev = utt_id_last + '-' + str(sub_utt_id_last)
                sub_utts[key_prev] = (utt_id_last, sub_utt)
                end_times[key_prev] = end_time_last
            sub_utt = []
            start_times[utt_id + '-' + str(sub_utt_id)] = start
            is_kept_last = False
        key = utt_id + '-' + str(sub_utt_id)
        # NOTE(review): the original literal here was stripped by text
        # mangling; assumed to be the '<unk>' oov token -- confirm upstream.
        if word == '<unk>':
            is_kept = True
            end_times[key] = start + dur
        elif word in vocab:
            is_kept = True
            sub_utt.append(word)
            end_times[key] = start + dur
        else:
            # Out-of-vocab word: it closes the current sub-utterance and also
            # opens the next one (attached to both as boundary context).
            is_kept = False
            if is_kept_last:
                sub_utt.append(word)
                key_last = utt_id + '-' + str(sub_utt_id_last)
                sub_utts[key_last] = (utt_id, sub_utt)
                end_times[key_last] = start + dur
            sub_utt_id += 1
            sub_utt = [word]
            start_times[utt_id + '-' + str(sub_utt_id)] = start
        utt_id_last = utt_id
        sub_utt_id_last = sub_utt_id
        is_kept_last = is_kept
        end_time_last = start + dur

    # Flush the trailing sub-utterance; its last word was already appended
    # inside the loop, so nothing is appended here (see fix (1) above).
    if is_kept and utt_id is not None:
        key = utt_id + '-' + str(sub_utt_id_last)
        sub_utts[key] = (utt_id, sub_utt)
        end_times[key] = end_time_last

    for key in sorted(sub_utts):
        print(key, ' '.join(sub_utts[key][1]), file=args.text_handle)
        print(key, sub_utts[key][0], start_times[key], end_times[key],
              file=args.subsegment_handle)


def ReadVocab(vocab_file_handle):
    """Read a one-word-per-line vocab file into a set of words."""
    vocab = set()
    if vocab_file_handle:
        for line in vocab_file_handle.readlines():
            splits = line.strip().split()
            if len(splits) == 0:
                continue
            if len(splits) > 1:
                raise Exception('Invalid format of line ' + line
                                + ' in vocab file.')
            vocab.add(splits[0])
    return vocab


def Main():
    args = GetArgs()

    vocab = ReadVocab(args.vocab_handle)
    GetSubsegments(args, vocab)


if __name__ == "__main__":
    Main()
from __future__ import print_function @@ -10,27 +10,36 @@ import math def GetArgs(): - parser = argparse.ArgumentParser(description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" - "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation" - "cadidates according to their soft-counts, and then select the top r * N candidates" - "(For words in the reference lexicon, N = # pron variants given by the reference" - "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." - "r is a user-specified constant, like 2.", - epilog = "See steps/dict/learn_lexicon.sh for example") - - parser.add_argument("--r", type = float, default = "2.0", - help = "a user-specified ratio parameter which determines how many" - "pronunciation candidates we want to keep for each word.") + parser = argparse.ArgumentParser( + description = "Prune pronunciation candidates based on soft-counts from lattice-alignment" + "outputs, and a reference lexicon. Basically, for each word we sort all pronunciation" + "cadidates according to their soft-counts, and then select the top variant-counts-ratio * N candidates" + "(For words in the reference lexicon, N = # pron variants given by the reference" + "lexicon; For oov words, N = avg. 
# pron variants per word in the reference lexicon).", + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") + + parser.add_argument("--variant-counts-ratio", type = float, default = "3.0", + help = "A user-specified ratio parameter which determines how many" + "pronunciation candidates we want to keep for each word at most.") parser.add_argument("pron_stats", metavar = "", type = str, - help = "File containing soft-counts of all pronounciation candidates; " + help = "File containing soft-counts of pronounciation candidates; " "each line must be ") + parser.add_argument("lexicon_phonetic_decoding", metavar = "", type = str, + help = "Lexicon containing pronunciation candidates from phonetic decoding." + "each line must be ") + parser.add_argument("lexiconp_g2p", metavar = "", type = str, + help = "Lexicon with probabilities for pronunciation candidates from G2P." + "each line must be ") parser.add_argument("ref_lexicon", metavar = "", type = str, help = "Reference lexicon file, where we obtain # pron variants for" "each word, based on which we prune the pron candidates." "Each line must be ") - parser.add_argument("pruned_prons", metavar = "", type = str, - help = "An output file in lexicon format, which contains prons we want to" - "prune off from the pron_stats file.") + parser.add_argument("lexicon_phonetic_decoding_pruned", metavar = "", type = str, + help = "Output lexicon containing pronunciation candidates from phonetic decoding after pruning." + "each line must be ") + parser.add_argument("lexicon_g2p_pruned", metavar = "", type = str, + help = "Output lexicon containing pronunciation candidates from G2P after pruning." 
+ "each line must be ") print (' '.join(sys.argv), file=sys.stderr) @@ -40,12 +49,13 @@ def GetArgs(): return args def CheckArgs(args): + print(args) args.pron_stats_handle = open(args.pron_stats) + args.lexicon_phonetic_decoding_handle = open(args.lexicon_phonetic_decoding) + args.lexiconp_g2p_handle = open(args.lexiconp_g2p) args.ref_lexicon_handle = open(args.ref_lexicon) - if args.pruned_prons == "-": - args.pruned_prons_handle = sys.stdout - else: - args.pruned_prons_handle = open(args.pruned_prons, "w") + args.lexicon_phonetic_decoding_pruned_handle = open(args.lexicon_phonetic_decoding_pruned, "w") + args.lexicon_g2p_pruned_handle = open(args.lexicon_g2p_pruned, "w") return args def ReadStats(pron_stats_handle): @@ -62,13 +72,11 @@ def ReadStats(pron_stats_handle): phones = ' '.join(splits[2:]) stats[word].append((phones, count)) - for word, entry in stats.iteritems(): - entry.sort(key=lambda x: x[1]) return stats -def ReadLexicon(ref_lexicon_handle): - ref_lexicon = defaultdict(set) - for line in ref_lexicon_handle.readlines(): +def ReadLexicon(lexicon_handle): + lexicon = defaultdict(set) + for line in lexicon_handle.readlines(): splits = line.strip().split() if len(splits) == 0: continue @@ -77,42 +85,74 @@ def ReadLexicon(ref_lexicon_handle): + ' in lexicon file.') word = splits[0] phones = ' '.join(splits[1:]) - ref_lexicon[word].add(phones) - return ref_lexicon + lexicon[word].add(phones) + return lexicon -def PruneProns(args, stats, ref_lexicon): +def ReadLexiconp(lexiconp_handle): + lexicon = defaultdict(set) + pron_probs = defaultdict(float) + for line in lexiconp_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 3: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[1] + prob = float(splits[0]) + phones = ' '.join(splits[2:]) + pron_probs[(word, phones)] = prob + lexicon[word].add(phones) + return lexicon, pron_probs + +def PruneProns(args, stats, 
ref_lexicon, lexicon_phonetic_decoding, lexicon_g2p, lexicon_g2p_probs): + # For those pron candidates from lexicon_phonetic_decoding/g2p which don't + # have stats, we append them to the "stats" dict, with a zero count. + for word, entry in stats.iteritems(): + prons_with_stats = set() + for (pron, count) in entry: + prons_with_stats.add(pron) + for pron in lexicon_g2p[word]: + if pron not in prons_with_stats: + entry.append((pron, lexicon_g2p_probs[(word, pron)]-1.0)) + entry.sort(key=lambda x: x[1]) + # Compute the average # pron variants counts per word in the reference lexicon. num_words_ref = 0 num_prons_ref = 0 for word, prons in ref_lexicon.iteritems(): num_words_ref += 1 num_prons_ref += len(prons) - avg_variants_counts_ref = math.ceil(float(num_prons_ref) / float(num_words_ref)) - + avg_variant_counts_ref = round(float(num_prons_ref) / float(num_words_ref)) for word, entry in stats.iteritems(): if word in ref_lexicon: - variants_counts = args.r * len(ref_lexicon[word]) + variant_counts = args.variant_counts_ratio * len(ref_lexicon[word]) else: - variants_counts = args.r * avg_variants_counts_ref + variant_counts = args.variant_counts_ratio * avg_variant_counts_ref num_variants = 0 - while num_variants < variants_counts: + count = 0.0 + while num_variants < variant_counts: try: - pron, prob = entry.pop() - if word not in ref_lexicon or pron not in ref_lexicon[word]: + pron, count = entry.pop() + if word in ref_lexicon and pron in ref_lexicon[word]: + continue + if pron in lexicon_phonetic_decoding[word]: + num_variants += 1 + print('{0} {1}'.format(word, pron), file=args.lexicon_phonetic_decoding_pruned_handle) + if pron in lexicon_g2p[word]: num_variants += 1 + print('{0} {1}'.format(word, pron), file=args.lexicon_g2p_pruned_handle) except IndexError: break - - for word, entry in stats.iteritems(): - for pron, prob in entry: - if word not in ref_lexicon or pron not in ref_lexicon[word]: - print('{0} {1}'.format(word, pron), file=args.pruned_prons_handle) 
def Main(): args = GetArgs() ref_lexicon = ReadLexicon(args.ref_lexicon_handle) + lexicon_phonetic_decoding = ReadLexicon(args.lexicon_phonetic_decoding_handle) + lexicon_g2p, lexicon_g2p_probs = ReadLexiconp(args.lexiconp_g2p_handle) stats = ReadStats(args.pron_stats_handle) - PruneProns(args, stats, ref_lexicon) + + PruneProns(args, stats, ref_lexicon, lexicon_phonetic_decoding, lexicon_g2p, lexicon_g2p_probs) if __name__ == "__main__": Main() diff --git a/egs/wsj/s5/steps/dict/internal/sum_arc_info.py b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py new file mode 100755 index 00000000000..d3913ec954f --- /dev/null +++ b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0 + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys + +class StrToBoolAction(argparse.Action): + """ A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False """ + def __call__(self, parser, namespace, values, option_string=None): + if values == "true": + setattr(namespace, self.dest, True) + elif values == "false": + setattr(namespace, self.dest, False) + else: + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) + + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "Accumulate statistics from per arc lattice statitics" + "for lexicon learning", + epilog = "See steps/dict/learn_lexicon.sh for example") + + parser.add_argument("--set-sum-to-one", type = str, default = True, + action = StrToBoolAction, choices = ["true", "false"], + help = "If normalize posteriors such that the sum of " + "pronunciation posteriors of a word in an utterance is 1.") + parser.add_argument("arc_info_file", metavar = "", type = str, + help = "File containing per arc statistics; " + "each line must be " + "") + parser.add_argument("phone_map", metavar = "", type = str, + help = "An input phone map 
def CheckArgs(args):
    """Resolve '-' to stdin/stdout and open all file handles."""
    if args.arc_info_file == "-":
        args.arc_info_file_handle = sys.stdin
    else:
        args.arc_info_file_handle = open(args.arc_info_file)

    args.phone_map_handle = open(args.phone_map)

    if args.stats_file == "-":
        args.stats_file_handle = sys.stdout
    else:
        args.stats_file_handle = open(args.stats_file, "w")

    return args


def Main():
    """Accumulate per-(word, utterance) pronunciation soft counts from arc info.

    Reads the phone map and the arc-info file, sums the posterior count of
    each pronunciation of each word occurrence, optionally normalizes the
    counts per (word, utt) to sum to one, and writes
    '<word> <utt> <start-frame> <count> <phones>' lines to the stats file.

    Fixes w.r.t. the original: the parsed --set-sum-to-one option is now
    honored (the original always normalized; the default of true preserves
    the old behavior), and the dead locals 'lexicon', 'prons', 'overlap'
    and 'sum_tot' were removed.
    """
    args = GetArgs()

    start_frames = {}   # (word, utt) -> first start frame seen for this pair
    # (word, utt) -> {pronunciation: accumulated soft count}
    stats = defaultdict(lambda: defaultdict(float))

    # Map word-position-dependent phones to position-independent ones.
    phone_map = {}
    for line in args.phone_map_handle:
        splits = line.strip().split()
        phone_map[splits[0]] = splits[1]

    for line in args.arc_info_file_handle:
        splits = line.strip().split()

        if len(splits) == 0:
            continue

        if len(splits) < 6:
            raise Exception('Invalid format of line ' + line
                            + ' in ' + args.arc_info_file)

        utt = splits[0]
        start_frame = int(splits[1])
        count = float(splits[3])
        word = splits[4]
        # NOTE(review): this literal was most likely an epsilon/oov token such
        # as '<eps>' before text mangling stripped the angle brackets; kept
        # as-is -- confirm against the upstream source.
        if word == '':
            continue

        phones = ' '.join(phone_map[phone] for phone in splits[5:])

        if (word, utt) not in start_frames:
            start_frames[(word, utt)] = start_frame

        # defaultdict makes the original "first occurrence vs. accumulate"
        # branching unnecessary.
        stats[(word, utt)][phones] += count

    for (word, utt) in stats:
        pron_counts = stats[(word, utt)]
        count_sum = sum(pron_counts.values())
        # By default we normalize the pron posteriors of each word in each
        # utterance so that they sum up exactly to one. If a word occurs twice
        # in an utterance, the effect is to average the posteriors of the two
        # occurrences into one "equivalent occurrence". This case should be
        # extremely rare if the utterances are short sub-utterances produced
        # by steps/dict/internal/get_subsegments.py.
        for phones, count in pron_counts.items():
            if args.set_sum_to_one:
                count = count / count_sum
            print(word, utt, start_frames[(word, utt)], count, phones,
                  file=args.stats_file_handle)

    args.stats_file_handle.close()


if __name__ == "__main__":
    Main()
+ echo " --variant-counts-ratio # This ratio parameter determines the maximum number of pronunciation" + echo " # candidates we will keep for each word, after pruning according to lattice statistics from" + echo " # the first iteration of lattice generation. See steps/dict/internal/prune_pron_candidates.py" + echo " # for details." echo " --prior-mean # Mean of priors (summing up to 1) assigned to three exclusive pronunciation" echo " # source: reference lexicon, g2p, and phonetic decoding (used in the Bayesian" echo " # pronunciation selection procedure). We recommend setting a larger prior" @@ -150,17 +155,17 @@ if [ $stage -le 0 ]; then # Remove non-scored-words from the reference lexicon. awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ - $ref_dict/lexicon.txt | tr -s '\t' ' ' > $dir/ref_lexicon.txt + $ref_dict/lexicon.txt | tr -s '\t' ' ' | awk '$1=$1' > $dir/ref_lexicon.txt cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ $target_vocab | sort | uniq > $dir/target_vocab.txt # From the reference lexicon, we estimate the target_num_prons_per_word as, - # ceiling(avg. # prons per word in the reference lexicon). This'll be used as + # round(avg. # prons per word in the reference lexicon). This'll be used as # the upper bound of # pron variants per word when we apply G2P or select prons to # construct the learned lexicon in later stages. - python -c 'import sys; import math; print int(math.ceil(float(sys.argv[1])/float(sys.argv[2])))' \ + python -c 'import sys; import math; print int(round(float(sys.argv[1])/float(sys.argv[2])))' \ `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \ > $dir/target_num_prons_per_word || exit 1; @@ -225,10 +230,11 @@ if [ $stage -le 2 ]; then # Get the oov words list (w.r.t ref vocab) which are in training data. 
awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \ - $dir/train_counts.txt | sort > $dir/oov_train.txt + $dir/train_counts.txt | awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' \ + $dir/non_scored_words - | sort > $dir/oov_train.txt || exit 1; awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \ - $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate + $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate || exit 1; echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:" cat $dir/train_oov_rate @@ -237,14 +243,14 @@ if [ $stage -le 2 ]; then # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciaiton # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on. awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \ - $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt + $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_train.txt || exit 1; # Get the pronunciation of oov_symbol. - oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | cut -f2- -d' '` + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` # For oov words in training data for which we don't even have G2P pron candidates, # we simply assign them the pronunciation of the oov symbol (like ). 
awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \ - $dir/oov_train.txt | awk -v op=$oov_pron '{print $0" "op}' > $dir/oov_train_no_pron.txt + $dir/oov_train.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_train_no_pron.txt || exit 1; cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \ awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ @@ -263,7 +269,7 @@ if [ $stage -le 3 ]; then # We prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob", # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon. - cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt > $dir/phonetic_decoding/filter_lexicon.txt + cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt | sort -u > $dir/phonetic_decoding/filter_lexicon.txt $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \ --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \ @@ -295,7 +301,7 @@ if [ $stage -le 4 ]; then # Generate lattices for the acoustic training data with the combined lexicon. if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi - steps/align_fmllr_lats.sh --cmd "$decode_cmd" --nj $nj \ + steps/align_fmllr_lats.sh --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \ $data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1; # Get arc level information from the lattice. @@ -321,13 +327,10 @@ if [ $stage -le 5 ]; then rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null # Prune away pronunciations which have low acoustic evidence from the first pass of lattice alignment. 
- $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py $dir/lats_iter1/pron_stats.txt $dir/ref_lexicon.txt $dir/pruned_prons.txt - - awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_phonetic_decoding.txt \ - > $dir/lexicon_phonetic_decoding_pruned.txt - - awk 'NR==FNR{a[$0] = 1; next} (!($0 in a))' $dir/pruned_prons.txt $dir/lexicon_g2p.txt \ - > $dir/lexicon_g2p_pruned.txt \ + $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \ + --variant-counts-ratio $variant_counts_ratio \ + $dir/lats_iter1/pron_stats.txt $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \ + $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt # Filter out words which don't appear in the acoustic training data cat $dir/lexicon_phonetic_decoding_pruned.txt $dir/lexicon_g2p_pruned.txt \ @@ -402,7 +405,7 @@ if [ $stage -le 7 ]; then # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any. cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \ awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \ - $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt + $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt || exit 1; awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \ $dir/lexicon_g2p.txt > $dir/g2p_prons_for_oov_no_acoustics.txt @@ -426,5 +429,5 @@ if [ $stage -le 8 ]; then echo " ... sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon." 
cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \ - sort | uniq > $dest_dict/lexicon.txt + sort | uniq > $dest_dict/lexicon.txt || exit 1; fi diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh new file mode 100755 index 00000000000..83aa98c1700 --- /dev/null +++ b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh @@ -0,0 +1,546 @@ +#! /bin/bash + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0 + +# This recipe has similar inputs and outputs as steps/dict/learn_lexicon.sh +# The major difference is, instead of using a Bayesian framework for +# pronunciation selection, we used a likelihood-reduction based greedy +# pronunciation selection framework presented in the paper: +# "Acoustic data-driven lexicon learning based on a greedy pronunciation " +# "selection framework, by X. Zhang, V. Mahonar, D. Povey and S. Khudanpur," +# "Interspeech 2017." + +# This script demonstrate how to expand a existing lexicon using a combination +# of acoustic evidence and G2P to learn a lexicon that covers words in a target +# vocab, and agrees sufficiently with the acoustics. The basic idea is to +# run phonetic decoding on acoustic training data using an existing +# acoustice model (possibly re-trained using a G2P-expanded lexicon) to get +# alternative pronunciations for words in training data. Then we combine three +# exclusive sources of pronunciations: the reference lexicon (supposedly +# hand-derived), phonetic decoding, and G2P (optional) into one lexicon and then run +# lattice alignment on the same data, to collect acoustic evidence (soft +# counts) of all pronunciations. Based on these statistics, we use a greedy +# framework (see steps/dict/select_prons_greedy.sh for details) to select an +# informative subset of pronunciations for each word with acoustic evidence. 
+# two important parameters are alpha and beta. Basically, the three dimensions of alpha +# and beta correspond to three pronunciation sources: phonetic-decoding, G2P and +# the reference lexicon, and the larger a value is, the more aggressive we'll +# prune pronunciations from that sooure. The valid range of each dim. is [0, 1] +# (for alpha, and 0 means we never pruned pron from that source.) [0, 100] (for beta). +# The output of steps/dict/select_prons_greedy.sh is a learned lexicon whose vocab +# matches the user-specified target-vocab, and two intermediate outputs which were +# used to generate the learned lexicon: an edits file which records the recommended +# changes to all in-ref-vocab words' prons, and a half-learned lexicon +# ($dest_dict/lexicon0.txt) where all in-ref-vocab words' prons were untouched +# (on top of which we apply the edits file to produce the final learned lexicon). +# The user can always modify the edits file manually and then re-apply it on the +# half-learned lexicon using steps/dict/apply_lexicon_edits.sh to produce the +# final learned lexicon. See the last stage in this script for details. + +stage=0 +# Begin configuration section. +cmd=queue.pl +nj= +stage=0 +oov_symbol= +lexiconp_g2p= +min_prob=0.3 +variant_counts_ratio=8 +variant_counts_no_acoustics=1 +alpha="0,0,0" +beta="0,0,0" +delta=0.0000001 +num_gauss= +num_leaves= +retrain_src_mdl=true +cleanup=true +nj_select_prons=200 +learn_iv_prons=false # whether we want to learn the prons of IV words (w.r.t. ref_vocab), + +# End configuration section. + +. ./path.sh +. utils/parse_options.sh + +if [ $# -lt 6 ] || [ $# -gt 7 ]; then + echo "Usage: $0 [options] \\" + echo " ." 
+ echo " This script does lexicon expansion using a combination of acoustic" + echo " evidence and G2P to produce a lexicon that covers words of a target vocab:" + echo "" + echo "Arguments:" + echo " The dir which contains the reference lexicon (most probably hand-derived)" + echo " we want to expand/improve, and nonsilence_phones.txt,.etc which we need " + echo " for building new dict dirs." + echo " The vocabulary we want the final learned lexicon to cover (one word per line)." + echo " acoustic training data we use to get alternative" + echo " pronunciations and collet acoustic evidence." + echo " The dir containing an SAT-GMM acoustic model (we optionaly we re-train it" + echo " using G2P expanded lexicon) to do phonetic decoding (to get alternative" + echo " pronunciations) and lattice-alignment (to collect acoustic evidence for" + echo " evaluating all prounciations)" + echo " The reference lang dir which we use to get non-scored-words" + echo " like for building new dict dirs" + echo " The dict dir where we put the final learned lexicon, whose vocab" + echo " matches ." + echo " The dir which contains all the intermediate outputs of this script." + echo "" + echo "Note: and the vocab of don't have to match. For words" + echo " who are in but not seen in , their pronunciations" + echo " will be given by G2P at the end." + echo "" + echo "e.g. $0 data/local/dict data/local/lm/librispeech-vocab.txt data/train \\" + echo " exp/tri3 data/lang data/local/dict_learned" + echo "Options:" + echo " --stage # stage to run from, to enable resuming from partially" + echo " # completed run (default: 0)" + echo " --cmd '$cmd' # command to submit jobs with (e.g. run.pl, queue.pl)" + echo " --nj # number of parallel jobs" + echo " --oov-symbol '$oov_symbol' # oov symbol, like ." + echo " --lexiconp-g2p # a lexicon (with prob in the second column) file containing g2p generated" + echo " # pronunciations, for words in acoustic training data / target vocabulary. 
It's optional." + echo " --min-prob # The cut-off parameter used to select pronunciation candidates from phonetic" + echo " # decoding. We remove pronunciations with probabilities less than this value" + echo " # after normalizing the probs s.t. the max-prob is 1.0 for each word." + echo " --variant-counts-ratio # This ratio parameter determines the maximum number of pronunciation" + echo " # candidates we will keep for each word, after pruning according to lattice statistics from" + echo " # the first iteration of lattice generation. See steps/dict/internal/prune_pron_candidates.py" + echo " # for details." + echo " --variant-counts-no-acoustics # how many g2p-prons per word we want to include for each words unseen in acoustic training data." + echo " --alpha ,, # scaling factors used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --beta ,, # smoothing factors used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --delta # a floor value used in the greedy pronunciation selection framework, " + echo " # see steps/dict/select_prons_greedy.py for details." + echo " --num-gauss # number of gaussians for the re-trained SAT model (on top of )." + echo " --num-leaves # number of leaves for the re-trained SAT model (on top of )." + echo " --retrain-src-mdl # true if you want to re-train the src_mdl before phone decoding (default false)." + exit 1 +fi + +echo "$0 $@" # Print the command line for logging + +ref_dict=$1 +target_vocab=$2 +data=$3 +src_mdl_dir=$4 +ref_lang=$5 +dest_dict=$6 + +if [ -z "$oov_symbol" ]; then + echo "$0: the --oov-symbol option is required." + exit 1 +fi + +if [ $# -gt 6 ]; then + dir=$7 # Most intermediate outputs will be put here. +else + dir=${src_mdl_dir}_lex_learn_work +fi + +mkdir -p $dir +if [ $stage -le 0 ]; then + echo "$0: Some preparatory work." + # Get the word counts of training data. 
+ awk '{for (n=2;n<=NF;n++) counts[$n]++;} END{for (w in counts) printf "%s %d\n",w, counts[w];}' \ + $data/text | sort > $dir/train_counts.txt + + # Get the non-scored entries and exclude them from the reference lexicon/vocab, and target_vocab. + steps/cleanup/internal/get_non_scored_words.py $ref_lang > $dir/non_scored_words + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt > $dir/non_scored_entries + + # Remove non-scored-words from the reference lexicon. + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $ref_dict/lexicon.txt | tr -s '\t' ' ' | awk '$1=$1' > $dir/ref_lexicon.txt + + cat $dir/ref_lexicon.txt | awk '{print $1}' | sort | uniq > $dir/ref_vocab.txt + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' $dir/non_scored_words \ + $target_vocab | sort | uniq > $dir/target_vocab.txt + + # From the reference lexicon, we estimate the target_num_prons_per_word as, + # round(avg. # prons per word in the reference lexicon). This'll be used as + # the upper bound of # pron variants per word when we apply G2P or select prons to + # construct the learned lexicon in later stages. + python -c 'import sys; import math; print int(round(float(sys.argv[1])/float(sys.argv[2])))' \ + `wc -l $dir/ref_lexicon.txt | awk '{print $1}'` `wc -l $dir/ref_vocab.txt | awk '{print $1}'` \ + > $dir/target_num_prons_per_word || exit 1; + + if [ -z $lexiconp_g2p ]; then + # create an empty list of g2p generated prons, if it's not given. + touch $dir/lexicon_g2p.txt + touch $dir/lexiconp_g2p.txt + else + # Exchange the 1st column (word) and 2nd column (prob) and remove pronunciations + # which are already in the reference lexicon. 
+ cat $lexiconp_g2p | awk '{a=$1;b=$2; $1="";$2="";print b" "a$0}' | \ + awk 'NR==FNR{a[$0] = 1; next} {w=$2;for (n=3;n<=NF;n++) w=w" "$n; if(!(w in a)) print $0}' \ + $dir/ref_lexicon.txt - > $dir/lexiconp_g2p.txt 2>/dev/null + + # make a copy where we remove the first column (probabilities). + cat $dir/lexiconp_g2p.txt | cut -f1,3- > $dir/lexicon_g2p.txt 2>/dev/null + fi + variant_counts=`cat $dir/target_num_prons_per_word` || exit 1; + $cmd $dir/log/prune_g2p_lexicon.log steps/dict/prons_to_lexicon.py \ + --top-N=$variant_counts $dir/lexiconp_g2p.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt || exit 1; +fi + +if [ $stage -le 1 ] && $retrain_src_mdl; then + echo "$0: Expand the reference lexicon to cover all words in the target vocab. and then" + echo " ... re-train the source acoustic model for phonetic decoding. " + mkdir -p $dir/dict_expanded_target_vocab + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_target_vocab 2>/dev/null + rm $dir/dict_expanded_target_vocab/lexiconp.txt $dir/dict_expanded_target_vocab/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in the target vocab. + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' $dir/ref_lexicon.txt \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_target_vocab.txt + + # Assign pronunciations from lexicon_g2p.txt to oov_target_vocab. For words which + # cannot be found in lexicon_g2p.txt, we simply ignore them. 
+ awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_target_vocab.txt \ + $dir/lexicon_g2p.txt > $dir/lexicon_g2p_oov_target_vocab.txt + + cat $dir/lexicon_g2p_oov_target_vocab.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt - | \ + cat $dir/non_scored_entries - | + sort | uniq > $dir/dict_expanded_target_vocab/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt $dir/dict_expanded_target_vocab \ + $oov_symbol $dir/lang_expanded_target_vocab_tmp $dir/lang_expanded_target_vocab || exit 1; + + # Align the acoustic training data using the given src_mdl_dir. + alidir=${src_mdl_dir}_ali_$(basename $data) + steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \ + $data $dir/lang_expanded_target_vocab $src_mdl_dir $alidir || exit 1; + + # Train another SAT system on the given data and put it in $dir/${src_mdl_dir}_retrained + # this model will be used for phonetic decoding and lattice alignment later on. + if [ -z $num_leaves ] || [ -z $num_gauss ] ; then + echo "num_leaves and num_gauss need to be specified." && exit 1; + fi + steps/train_sat.sh --cmd "$train_cmd" $num_leaves $num_gauss \ + $data $dir/lang_expanded_target_vocab $alidir $dir/${src_mdl_dir}_retrained || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: Expand the reference lexicon to cover all words seen in," + echo " ... acoustic training data, and prepare corresponding dict and lang directories." + echo " ... This is needed when generate pron candidates from phonetic decoding." + mkdir -p $dir/dict_expanded_train + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_expanded_train 2>/dev/null + rm $dir/dict_expanded_train/lexiconp.txt $dir/dict_expanded_train/lexicon.txt 2>/dev/null + + # Get the oov words list (w.r.t ref vocab) which are in training data. 
+ awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_lexicon.txt \ + $dir/train_counts.txt | awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $0}' \ + $dir/non_scored_words - | sort > $dir/oov_train.txt || exit 1; + + awk 'NR==FNR{a[$1] = 1; next} {if(($1 in a)) b+=$2; else c+=$2} END{print c/(b+c)}' \ + $dir/ref_vocab.txt $dir/train_counts.txt > $dir/train_oov_rate || exit 1; + + echo "OOV rate (w.r.t. the reference lexicon) of the acoustic training data is:" + cat $dir/train_oov_rate + + # Assign pronunciations from lexicon_g2p to oov_train. For words which + # cannot be found in lexicon_g2p, we simply assign oov_symbol's pronunciaiton + # (like NSN) to them, in order to get phonetic decoding pron candidates for them later on. + variant_counts=`cat $dir/target_num_prons_per_word` || exit 1; + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_train.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt > $dir/g2p_prons_for_oov_train.txt || exit 1; + + # Get the pronunciation of oov_symbol. + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` + # For oov words in training data for which we don't even have G2P pron candidates, + # we simply assign them the pronunciation of the oov symbol (like ), + # so that we can get pronunciations for them from phonetic decoding. 
+ awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_train.txt \ + $dir/oov_train.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_train_no_pron.txt || exit 1; + + cat $dir/oov_train_no_pron.txt $dir/g2p_prons_for_oov_train.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat - $dir/non_scored_entries | \ + sort | uniq > $dir/dict_expanded_train/lexicon.txt || exit 1; + + utils/prepare_lang.sh $dir/dict_expanded_train $oov_symbol \ + $dir/lang_expanded_train_tmp $dir/lang_expanded_train || exit 1; +fi + +if [ $stage -le 3 ]; then + echo "$0: Generate pronunciation candidates from phonetic decoding on acoustic training data.." + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + steps/cleanup/debug_lexicon.sh --nj $nj \ + --cmd "$decode_cmd" $data $dir/lang_expanded_train \ + $mdl_dir $dir/dict_expanded_train/lexicon.txt $dir/phonetic_decoding || exit 1; +fi + +if [ $stage -le 4 ]; then + echo "$0: Combine the reference lexicon and pronunciations from phone-decoding/G2P into one" + echo " ... lexicon, and run lattice alignment using this lexicon on acoustic training data" + echo " ... to collect acoustic evidence." + # We first prune the phonetic decoding generated prons relative to the largest count, by setting "min_prob", + # and only leave prons who are not present in the reference lexicon / g2p-generated lexicon. + cat $dir/ref_lexicon.txt $dir/lexicon_g2p.txt | sort -u > $dir/phonetic_decoding/filter_lexicon.txt + + $cmd $dir/phonetic_decoding/log/prons_to_lexicon.log steps/dict/prons_to_lexicon.py \ + --min-prob=$min_prob --filter-lexicon=$dir/phonetic_decoding/filter_lexicon.txt \ + $dir/phonetic_decoding/prons.txt $dir/lexicon_pd_with_eps.txt + + # We abandon phonetic-decoding candidates for infrequent words. 
+ awk '{if($2 < 3) print $1}' $dir/train_counts.txt > $dir/pd_candidates_to_exclude.txt + awk 'NR==FNR{a[$1] = $2; next} {if(a[$1]<10) print $1}' $dir/train_counts.txt \ + $dir/oov_train_no_pron.txt >> $dir/pd_candidates_to_exclude.txt + + if [ -s $dir/pd_candidates_to_exclude.txt ]; then + cat $dir/lexicon_pd_with_eps.txt | grep -vP "|||\[.*\]" | \ + awk 'NR==FNR{a[$0] = 1; next} {if(!($1 in a)) print $0}' $dir/pd_candidates_to_exclude.txt - | \ + sort | uniq > $dir/lexicon_pd.txt || exit 1; + else + cat $dir/lexicon_pd_with_eps.txt | grep -vP "|||\[.*\]" | \ + sort | uniq > $dir/lexicon_pd.txt || exit 1; + fi + + # Combine the reference lexicon, pronunciations from G2P and phonetic decoding into one lexicon. + mkdir -p $dir/dict_combined_iter1 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter1/ 2>/dev/null + rm $dir/dict_combined_iter1/lexiconp.txt $dir/dict_combined_iter1/lexicon.txt 2>/dev/null + + # Filter out words which don't appear in the acoustic training data + cat $dir/lexicon_pd.txt $dir/lexicon_g2p.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter1/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter1 $oov_symbol \ + $dir/lang_combined_iter1_tmp $dir/lang_combined_iter1 || exit 1; + + # Generate lattices for the acoustic training data with the combined lexicon. + if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi + + # Get the vocab for words for which we want to learn pronunciations. + if $learn_iv_prons; then + # If we want to learn the prons of IV words (w.r.t. ref_vocab), the learn_vocab is just the intersection of + # target_vocab and the vocab of words seen in acoustic training data (first col. 
of train_counts.txt) + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $1}' $dir/target_vocab.txt $dir/train_counts.txt \ + > $dir/learn_vocab.txt + else + # Exclude words from the ref_vocab if we don't want to learn the pronunciations of IV words. + awk 'NR==FNR{a[$1] = 1; next} {if($1 in a) print $1}' $dir/target_vocab.txt $dir/train_counts.txt | \ + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/ref_vocab.txt - > $dir/learn_vocab.txt + fi + + # In order to get finer lattice stats of alternative prons, we want to make lattices deeper. + # To speed up lattice generation, we use a ctm to create sub-utterances and a sub-segmentation + # for each instance of a word within learn_vocab (or a string of consecutive words within learn_vocab), + # including a single out-of-learn-vocab word at the boundary if present. + mkdir -p $dir/resegmentation + steps/dict/internal/get_subsegments.py $dir/phonetic_decoding/word.ctm $dir/learn_vocab.txt \ + $dir/resegmentation/subsegments $dir/resegmentation/text || exit 1; + utils/data/subsegment_data_dir.sh $data $dir/resegmentation/subsegments $dir/resegmentation/text \ + $dir/resegmentation/data || exit 1; + steps/compute_cmvn_stats.sh $dir/resegmentation/data || exit 1; + + steps/align_fmllr_lats.sh --beam 20 --retry-beam 50 --final-beam 30 --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \ + $dir/resegmentation/data $dir/lang_combined_iter1 $mdl_dir $dir/lats_iter1 || exit 1; + + # Get arc level information from the lattice. 
+ $cmd JOB=1:$nj $dir/lats_iter1/log/get_arc_info.JOB.log \
+ lattice-align-words $dir/lang_combined_iter1/phones/word_boundary.int \
+ $dir/lats_iter1/final.mdl \
+ "ark:gunzip -c $dir/lats_iter1/lat.JOB.gz |" ark:- \| \
+ lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter1/final.mdl ark:- - \| \
+ utils/int2sym.pl -f 5 $dir/lang_combined_iter1/words.txt \| \
+ utils/int2sym.pl -f 6- $dir/lang_combined_iter1/phones.txt '>' \
+ $dir/lats_iter1/arc_info_sym.JOB.txt || exit 1;
+
+ # Compute soft counts (pron_stats) of every particular word-pronunciation pair by
+ # summing up arc level information over all utterances. We'll use this to prune
+ # pronunciation candidates before the next iteration of lattice generation.
+ cat $dir/lats_iter1/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \
+ $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/pron_stats.txt || exit 1;
+
+ # Accumulate utterance-level pronunciation posteriors (into arc_stats) by summing up
+ # posteriors of arcs representing the same word & pronunciation and starting
+ # from roughly the same location. See steps/dict/internal/sum_arc_info.py for details.
+ for i in `seq 1 $nj`;do
+ cat $dir/lats_iter1/arc_info_sym.${i}.txt | sort -n -k1 -k2 -k3r | \
+ steps/dict/internal/sum_arc_info.py - $dir/phonetic_decoding/phone_map.txt $dir/lats_iter1/arc_info_summed.${i}.txt
+ done
+ cat $dir/lats_iter1/arc_info_summed.*.txt | sort -k1 -k2 > $dir/lats_iter1/arc_stats.txt
+
+ # Prune the phonetic_decoding lexicon so that any pronunciation that only has non-zero posterior at one word example will be removed.
+ # The pruned lexicon is put in $dir/lats_iter1. After further pruning in the next stage it'll be put back to $dir. 
+ awk 'NR==FNR{w=$1;for (n=5;n<=NF;n++) w=w" "$n;a[w]+=1;next} {if($0 in a && a[$0]>1) print $0}' \ + $dir/lats_iter1/arc_stats.txt $dir/lexicon_pd.txt > $dir/lats_iter1/lexicon_pd_pruned.txt +fi + +# Here we re-generate lattices (with a wider beam and a pruned combined lexicon) and re-collect pronunciation statistics +if [ $stage -le 5 ]; then + echo "$0: Prune the pronunciation candidates generated from G2P/phonetic decoding, and re-do lattice-alignment." + mkdir -p $dir/dict_combined_iter2 + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dir/dict_combined_iter2/ 2>/dev/null + rm $dir/dict_combined_iter2/lexiconp.txt $dir/dict_combined_iter2/lexicon.txt 2>/dev/null + + # Prune away pronunciations which have low acoustic evidence from the first pass of lattice generation. + $cmd $dir/lats_iter1/log/prune_pron_candidates.log steps/dict/internal/prune_pron_candidates.py \ + --variant-counts-ratio $variant_counts_ratio \ + $dir/lats_iter1/pron_stats.txt $dir/lats_iter1/lexicon_pd_pruned.txt $dir/lexiconp_g2p.txt $dir/ref_lexicon.txt \ + $dir/lexicon_pd_pruned.txt $dir/lexicon_g2p_pruned.txt + + # Filter out words which don't appear in the acoustic training data. + cat $dir/lexicon_pd_pruned.txt $dir/lexicon_g2p_pruned.txt \ + $dir/ref_lexicon.txt | tr -s '\t' ' ' | \ + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/train_counts.txt - | \ + cat $dir/non_scored_entries - | \ + sort | uniq > $dir/dict_combined_iter2/lexicon.txt + + utils/prepare_lang.sh --phone-symbol-table $ref_lang/phones.txt \ + $dir/dict_combined_iter2 $oov_symbol \ + $dir/lang_combined_iter2_tmp $dir/lang_combined_iter2 || exit 1; + + # Re-generate lattices with a wider beam, so that we'll get deeper lattices. 
+ if $retrain_src_mdl; then mdl_dir=$dir/${src_mdl_dir}_retrained; else mdl_dir=$src_mdl_dir; fi
+ steps/align_fmllr_lats.sh --beam 30 --retry-beam 60 --final-beam 50 --acoustic-scale 0.05 --cmd "$decode_cmd" --nj $nj \
+ $dir/resegmentation/data $dir/lang_combined_iter2 $mdl_dir $dir/lats_iter2 || exit 1;
+
+ # Get arc level information from the lattice as we did in the last stage.
+ $cmd JOB=1:$nj $dir/lats_iter2/log/get_arc_info.JOB.log \
+ lattice-align-words $dir/lang_combined_iter2/phones/word_boundary.int \
+ $dir/lats_iter2/final.mdl \
+ "ark:gunzip -c $dir/lats_iter2/lat.JOB.gz |" ark:- \| \
+ lattice-arc-post --acoustic-scale=0.1 $dir/lats_iter2/final.mdl ark:- - \| \
+ utils/int2sym.pl -f 5 $dir/lang_combined_iter2/words.txt \| \
+ utils/int2sym.pl -f 6- $dir/lang_combined_iter2/phones.txt '>' \
+ $dir/lats_iter2/arc_info_sym.JOB.txt || exit 1;
+
+ # Compute soft counts (pron_stats) of every particular word-pronunciation pair as
+ # we did in the last stage. The stats will only be used as diagnostics.
+ cat $dir/lats_iter2/arc_info_sym.*.txt | steps/dict/get_pron_stats.py - \
+ $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/pron_stats.txt || exit 1;
+
+ # Accumulate utterance-level pronunciation posteriors as we did in the last stage.
+ for i in `seq 1 $nj`;do
+ cat $dir/lats_iter2/arc_info_sym.${i}.txt | sort -n -k1 -k2 -k3r | \
+ steps/dict/internal/sum_arc_info.py - $dir/phonetic_decoding/phone_map.txt $dir/lats_iter2/arc_info_summed.${i}.txt
+ done
+ cat $dir/lats_iter2/arc_info_summed.*.txt | sort -k1 -k2 > $dir/lats_iter2/arc_stats.txt
+
+ # The pron_stats are the acoustic evidence which the likelihood-reduction-based pronunciation
+ # selection procedure will be based on.
+ # Split the utterance-level pronunciation posterior stats into $nj_select_prons pieces,
+ # so that the following pronunciation selection stage can be parallelized. 
+ numsplit=$nj_select_prons
+ awk '{print $1"-"$2" "$1}' $dir/lats_iter2/arc_stats.txt > $dir/lats_iter2/utt2word
+ utt2words=$(for n in `seq $numsplit`; do echo $dir/lats_iter2/utt2word.$n; done)
+ utils/split_scp.pl --utt2spk=$dir/lats_iter2/utt2word $dir/lats_iter2/utt2word $utt2words || exit 1
+ for n in `seq $numsplit`; do
+ (cat $dir/lats_iter2/utt2word.$n | awk '{$1=substr($1,length($2)+2);print $2" "$1}' - > $dir/lats_iter2/word2utt.$n
+ awk 'NR==FNR{a[$0] = 1; next} {b=$1" "$2; if(b in a) print $0}' $dir/lats_iter2/word2utt.$n \
+ $dir/lats_iter2/arc_stats.txt > $dir/lats_iter2/arc_stats.${n}.txt
+ ) &
+ done
+ wait
+fi
+
+if [ $stage -le 6 ]; then
+ echo "$0: Select pronunciations according to the acoustic evidence from lattice alignment."
+ # Given the acoustic evidence (soft-counts), we use a greedy, likelihood-reduction-based framework to select
+ # pronunciations from three exclusive candidate sources: reference (hand-derived) lexicon, G2P and phonetic decoding.
+ # The posteriors for all candidate prons for all words are printed into pron_posteriors.txt.
+ # For words which are out of the ref. vocab, the learned prons are written into out_of_ref_vocab_prons_learned.txt.
+ # Among them, for words without acoustic evidence, we just ignore them (even if pron candidates from G2P were provided).
+ # For words in the ref. vocab, we instead output a human readable & editable "edits" file called
+ # ref_lexicon_edits.txt, which records all proposed changes to the prons (if any). Also, a
+ # summary is printed into the log file. 
+ + $cmd JOB=1:$nj_select_prons $dir/lats_iter2/log/generate_learned_lexicon.JOB.log \ + steps/dict/select_prons_greedy.py \ + --alpha=${alpha} --beta=${beta} \ + --delta=${delta} \ + $ref_dict/silence_phones.txt $dir/lats_iter2/arc_stats.JOB.txt $dir/train_counts.txt $dir/ref_lexicon.txt \ + $dir/lexicon_g2p_pruned.txt $dir/lexicon_pd_pruned.txt \ + $dir/lats_iter2/learned_lexicon.JOB.txt || exit 1; + + cat $dir/lats_iter2/learned_lexicon.*.txt > $dir/lats_iter2/learned_lexicon.txt + rm $dir/lats_iter2/learned_lexicon.*.txt + + $cmd $dir/lats_iter2/log/lexicon_learning_summary.log \ + steps/dict/merge_learned_lexicons.py \ + $dir/lats_iter2/arc_stats.txt $dir/train_counts.txt $dir/ref_lexicon.txt \ + $dir/lexicon_g2p_pruned.txt $dir/lexicon_pd_pruned.txt \ + $dir/lats_iter2/learned_lexicon.txt \ + $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/lats_iter2/ref_lexicon_edits.txt || exit 1; + + cp $dir/lats_iter2/ref_lexicon_edits.txt $dir/lats_iter2/ref_lexicon_edits.txt + # Remove some stuff that takes up space and is unlikely to be useful later on. + if $cleanup; then + rm -r $dir/lats_iter*/{fsts*,lat*} 2>/dev/null + fi +fi + +if [ $stage -le 7 ]; then + echo "$0: Expand the learned lexicon further to cover words in target vocab that are." + echo " ... not seen in acoustic training data." + mkdir -p $dest_dict + cp $ref_dict/{extra_questions.txt,optional_silence.txt,nonsilence_phones.txt,silence_phones.txt} \ + $dest_dict 2>/dev/null + rm $dest_dict/lexiconp.txt $dest_dict/lexicon.txt 2>/dev/null + # Get the list of oov (w.r.t. ref vocab) without acoustic evidence, which are in the + # target vocab. We'll just assign to them pronunciations from lexicon_g2p, if any. 
+ cat $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt $dir/ref_lexicon.txt | \ + awk 'NR==FNR{a[$1] = 1; next} !($1 in a)' - \ + $dir/target_vocab.txt | sort | uniq > $dir/oov_no_acoustics.txt || exit 1; + + variant_counts=$variant_counts_no_acoustics + + $cmd $dir/log/prune_g2p_lexicon.log steps/dict/prons_to_lexicon.py \ + --top-N=$variant_counts $dir/lexiconp_g2p.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt || exit 1; + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/oov_no_acoustics.txt \ + $dir/lexicon_g2p_variant_counts${variant_counts}.txt > $dir/g2p_prons_for_oov_no_acoustics.txt|| exit 1; + + # Get the pronunciation of oov_symbol. + oov_pron=`cat $dir/non_scored_entries | grep $oov_symbol | awk '{print $2}'` || exit 1; + # For oov words in target_vocab for which we don't even have G2P pron candidates, + # we simply assign them the pronunciation of the oov symbol (like ), + if [ -s $dir/g2p_prons_for_oov_no_acoustics.txt ]; then + awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $dir/g2p_prons_for_oov_no_acoustics.txt \ + $dir/oov_no_acoustics.txt | awk -v op="$oov_pron" '{print $0" "op}' > $dir/oov_target_vocab_no_pron.txt || exit 1; + else + awk -v op="$oov_pron" '{print $0" "op}' $dir/oov_no_acoustics.txt > $dir/oov_target_vocab_no_pron.txt || exit 1 + fi + + # We concatenate three lexicons togethers: G2P lexicon for oov words without acoustics, + # learned lexicon for oov words with acoustics, and the original reference lexicon (for + # this part, later one we'll apply recommended changes using steps/dict/apply_lexicon_edits.py + cat $dir/g2p_prons_for_oov_no_acoustics.txt $dir/lats_iter2/out_of_ref_vocab_prons_learned.txt \ + $dir/oov_target_vocab_no_pron.txt $dir/ref_lexicon.txt | tr -s '\t' ' ' | sort | uniq > $dest_dict/lexicon.temp + + awk 'NR==FNR{a[$1] = 1; next} ($1 in a)' $dir/target_vocab.txt \ + $dest_dict/lexicon.temp | sort | uniq > $dest_dict/lexicon.nosil + + cat $dir/non_scored_entries $dest_dict/lexicon.nosil 
| sort | uniq >$dest_dict/lexicon0.txt +fi + +if [ $stage -le 8 ]; then + echo "$0: Apply the ref_lexicon_edits file to the reference lexicon." + echo " ... The user can inspect/modify the edits file and then re-run:" + echo " ... steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \\" + echo " ... sort -u \> $dest_dict/lexicon.txt to re-produce the final learned lexicon." + cp $dir/lats_iter2/ref_lexicon_edits.txt $dest_dict/lexicon_edits.txt 2>/dev/null + steps/dict/apply_lexicon_edits.py $dest_dict/lexicon0.txt $dir/lats_iter2/ref_lexicon_edits.txt - | \ + sort | uniq > $dest_dict/lexicon.txt || exit 1; +fi + +echo "Lexicon learning ends successfully. Please refer to $dir/lats_iter2/log/lexicon_learning_summary.log" +echo " for a summary. The learned lexicon, whose vocab matches the target_vocab, is $dest_dict/lexicon.txt" diff --git a/egs/wsj/s5/steps/dict/merge_learned_lexicons.py b/egs/wsj/s5/steps/dict/merge_learned_lexicons.py new file mode 100755 index 00000000000..6df7eb7a744 --- /dev/null +++ b/egs/wsj/s5/steps/dict/merge_learned_lexicons.py @@ -0,0 +1,261 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. + +from __future__ import print_function +from collections import defaultdict +import argparse +import sys +import math + +def GetArgs(): + parser = argparse.ArgumentParser( + description = "Convert a learned lexicon produced by steps/dict/select_prons_greedy.sh" + "into a lexicon for OOV words (w.r.t. ref. vocab) and a human editable lexicon-edit file." + "for in-vocab words, and generate detailed summaries of the lexicon learning results" + "The inputs are a learned lexicon, an arc-stats file, and three source lexicons " + "(phonetic-decoding(PD)/G2P/ref). 
The outputs are: a learned lexicon for OOVs" + "(learned_lexicon_oov), and a lexicon_edits file (ref_lexicon_edits) containing" + "suggested modifications of prons, for in-vocab words.", + epilog = "See steps/dict/learn_lexicon_greedy.sh for example.") + parser.add_argument("arc_stats_file", metavar = "", type = str, + help = "File containing word-pronunciation statistics obtained from lattices; " + "each line must be ") + parser.add_argument("word_counts_file", metavar = "", type = str, + help = "File containing word counts in acoustic training data; " + "each line must be .") + parser.add_argument("ref_lexicon", metavar = "", type = str, + help = "The reference lexicon (most probably hand-derived)." + "Each line must be ") + parser.add_argument("g2p_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from G2P results." + "Each line must be ") + parser.add_argument("pd_lexicon", metavar = "", type = str, + help = "Candidate ronouciations from phonetic decoding results." + "Each line must be ") + parser.add_argument("learned_lexicon", metavar = "", type = str, + help = "Learned lexicon." + "Each line must be ") + parser.add_argument("learned_lexicon_oov", metavar = "", type = str, + help = "Output file which is the learned lexicon for words out of the ref. vocab.") + parser.add_argument("ref_lexicon_edits", metavar = "", type = str, + help = "Output file containing human-readable & editable pronounciation info (and the" + "accept/reject decision made by our algorithm) for those words in ref. vocab," + "to which any change has been recommended. The info for each word is like:" + "------------ an 4086.0 --------------" + "R | Y | 2401.6 | AH N" + "R | Y | 640.8 | AE N" + "P | Y | 1035.5 | IH N" + "R(ef), P(hone-decoding) represents the pronunciation source" + "Y/N means the recommended decision of including this pron or not" + "and the numbers are soft counts accumulated from lattice-align-word outputs. 
" + "See the function WriteEditsAndSummary for more details.") + + print (' '.join(sys.argv), file=sys.stderr) + + args = parser.parse_args() + args = CheckArgs(args) + + return args + +def CheckArgs(args): + if args.arc_stats_file == "-": + args.arc_stats_file_handle = sys.stdin + else: + args.arc_stats_file_handle = open(args.arc_stats_file) + args.word_counts_file_handle = open(args.word_counts_file) + args.ref_lexicon_handle = open(args.ref_lexicon) + args.g2p_lexicon_handle = open(args.g2p_lexicon) + args.pd_lexicon_handle = open(args.pd_lexicon) + args.learned_lexicon_handle = open(args.learned_lexicon) + args.learned_lexicon_oov_handle = open(args.learned_lexicon_oov, "w") + args.ref_lexicon_edits_handle = open(args.ref_lexicon_edits, "w") + + return args + +def ReadArcStats(arc_stats_file_handle): + stats = defaultdict(lambda : defaultdict(dict)) + stats_summed = defaultdict(float) + for line in arc_stats_file_handle.readlines(): + splits = line.strip().split() + + if (len(splits) == 0): + continue + + if (len(splits) < 5): + raise Exception('Invalid format of line ' + line + + ' in ' + arc_stats_file) + utt = splits[1] + start_frame = int(splits[2]) + word = splits[0] + count = float(splits[3]) + phones = splits[4:] + phones = ' '.join(phones) + stats[word][(utt, start_frame)][phones] = count + stats_summed[(word, phones)] += count + return stats, stats_summed + +def ReadWordCounts(word_counts_file_handle): + counts = {} + for line in word_counts_file_handle.readlines(): + splits = line.strip().split() + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in counts file.') + word = splits[0] + count = int(splits[1]) + counts[word] = count + return counts + +def ReadLexicon(args, lexicon_file_handle, counts): + # we're skipping any word not in counts (not seen in training data), + # cause we're only learning prons for words who have acoustic examples. 
+ lexicon = defaultdict(set) + for line in lexicon_file_handle.readlines(): + splits = line.strip().split() + if len(splits) == 0: + continue + if len(splits) < 2: + raise Exception('Invalid format of line ' + line + + ' in lexicon file.') + word = splits[0] + if word not in counts: + continue + phones = ' '.join(splits[1:]) + lexicon[word].add(phones) + return lexicon + +def WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, pd_lexicon, g2p_lexicon, counts, stats, stats_summed): + # Note that learned_lexicon and ref_lexicon are dicts of sets of prons, while the other two lexicons are sets of (word, pron) pairs. + threshold = 2 + words = [defaultdict(set) for i in range(4)] # "words" contains four bins, where we + # classify each word into, according to whether it's count > threshold, + # and whether it's OOVs w.r.t the reference lexicon. + + src = {} + print("# Note: This file contains pronunciation info for words who have candidate " + "prons from G2P/phonetic-decoding accepted in the learned lexicon" + ", sorted by their counts in acoustic training data, " + ,file=args.ref_lexicon_edits_handle) + print("# 1st Col: source of the candidate pron: G(2P) / P(hone-decoding) / R(eference)." + ,file=args.ref_lexicon_edits_handle) + print("# 2nd Col: accepted or not in the learned lexicon (Y/N).", file=args.ref_lexicon_edits_handle) + print("# 3rd Col: soft counts from lattice-alignment (not augmented by prior-counts)." + ,file=args.ref_lexicon_edits_handle) + print("# 4th Col: the pronunciation cadidate.", file=args.ref_lexicon_edits_handle) + + # words which are to be printed into the edits file. + words_to_edit = [] + num_prons_tot = 0 + for word in learned_lexicon: + num_prons_tot += len(learned_lexicon[word]) + count = len(stats[word]) # This count could be smaller than the count read from the dict "counts", + # since in each sub-utterance, multiple occurences (which is rare) of the same word are compressed into one. 
+ # We use this count here so that in the edit-file, soft counts for each word sum up to one. + flags = ['0' for i in range(3)] # "flags" contains three binary indicators, + # indicating where this word's pronunciations come from. + for pron in learned_lexicon[word]: + if word in pd_lexicon and pron in pd_lexicon[word]: + flags[0] = '1' + src[(word, pron)] = 'P' + elif word in ref_lexicon and pron in ref_lexicon[word]: + flags[1] = '1' + src[(word, pron)] = 'R' + elif word in g2p_lexicon and pron in g2p_lexicon[word]: + flags[2] = '1' + src[(word, pron)] = 'G' + if word in ref_lexicon: + all_ref_prons_accepted = True + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + all_ref_prons_accepted = False + break + if not all_ref_prons_accepted or flags[0] == '1' or flags[2] == '1': + words_to_edit.append((word, len(stats[word]))) + if count > threshold: + words[0][flags[0] + flags[1] + flags[2]].add(word) + else: + words[1][flags[0] + flags[1] + flags[2]].add(word) + else: + if count > threshold: + words[2][flags[0] + flags[2]].add(word) + else: + words[3][flags[0] + flags[2]].add(word) + + words_to_edit_sorted = sorted(words_to_edit, key=lambda entry: entry[1], reverse=True) + for word, count in words_to_edit_sorted: + print("------------",word, "%2.1f" % count, "--------------", file=args.ref_lexicon_edits_handle) + learned_prons = [] + for pron in learned_lexicon[word]: + learned_prons.append((src[(word, pron)], 'Y', stats_summed[(word, pron)], pron)) + for pron in ref_lexicon[word]: + if pron not in learned_lexicon[word]: + learned_prons.append(('R', 'N', stats_summed[(word, pron)], pron)) + learned_prons_sorted = sorted(learned_prons, key=lambda item: item[2], reverse=True) + for item in learned_prons_sorted: + print('{} | {} | {:.2f} | {}'.format(item[0], item[1], item[2], item[3]), file=args.ref_lexicon_edits_handle) + + num_oovs_with_acoustic_evidence = len(set(learned_lexicon.keys()).difference(set(ref_lexicon.keys()))) + num_oovs = 
len(set(counts.keys()).difference(set(ref_lexicon.keys()))) + num_ivs = len(learned_lexicon) - num_oovs_with_acoustic_evidence + print("Average num. prons per word in the learned lexicon is {}".format(float(num_prons_tot)/float(len(learned_lexicon))), file=sys.stderr) + # print("Here are the words whose reference pron candidates were all declined", words[0]['100'], file=sys.stderr) + print("-------------------------------------------------Summary------------------------------------------", file=sys.stderr) + print("We have acoustic evidence for {} out of {} in-vocab (w.r.t the reference lexicon) words from the acoustic training data.".format(num_ivs, len(ref_lexicon)), file=sys.stderr) + print(" Among those frequent words whose counts in the training text > ", threshold, ":", file=sys.stderr) + num_freq_ivs_from_all_sources = len(words[0]['111']) + len(words[0]['110']) + len(words[0]['011']) + num_freq_ivs_from_g2p_or_phonetic_decoding = len(words[0]['101']) + len(words[0]['001']) + len(words[0]['100']) + num_freq_ivs_from_ref = len(words[0]['010']) + num_infreq_ivs_from_all_sources = len(words[1]['111']) + len(words[1]['110']) + len(words[1]['011']) + num_infreq_ivs_from_g2p_or_phonetic_decoding = len(words[1]['101']) + len(words[1]['001']) + len(words[1]['100']) + num_infreq_ivs_from_ref = len(words[1]['010']) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_freq_ivs_from_all_sources), file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_freq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_freq_ivs_from_ref), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon, G2P/phonetic-decoding.'.format(num_infreq_ivs_from_all_sources), 
file=sys.stderr) + print(' {} words\' selected prons come from G2P/phonetic-decoding-generated.'.format(num_infreq_ivs_from_g2p_or_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from the reference lexicon only.'.format(num_infreq_ivs_from_ref), file=sys.stderr) + print("---------------------------------------------------------------------------------------------------", file=sys.stderr) + num_freq_oovs_from_both_sources = len(words[2]['11']) + num_freq_oovs_from_phonetic_decoding = len(words[2]['10']) + num_freq_oovs_from_g2p = len(words[2]['01']) + num_infreq_oovs_from_both_sources = len(words[3]['11']) + num_infreq_oovs_from_phonetic_decoding = len(words[3]['10']) + num_infreq_oovs_from_g2p = len(words[3]['01']) + print('We have acoustic evidence for {} out of {} OOV (w.r.t the reference lexicon) words from the acoustic training data.'.format(num_oovs_with_acoustic_evidence, num_oovs), file=sys.stderr) + print(' Among those words whose counts in the training text > {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_freq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_freq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_freq_oovs_from_g2p), file=sys.stderr) + print(' For those words whose counts in the training text <= {}:'.format(threshold), file=sys.stderr) + print(' {} words\' selected prons came from G2P and phonetic-decoding.'.format(num_infreq_oovs_from_both_sources), file=sys.stderr) + print(' {} words\' selected prons came from phonetic decoding only.'.format(num_infreq_oovs_from_phonetic_decoding), file=sys.stderr) + print(' {} words\' selected prons came from G2P only.'.format(num_infreq_oovs_from_g2p), file=sys.stderr) + +def WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, file_handle): + for word, prons 
in learned_lexicon.iteritems(): + if word not in ref_lexicon: + for pron in prons: + print('{0} {1}'.format(word, pron), file=file_handle) + file_handle.close() + +def Main(): + args = GetArgs() + + # Read in three lexicon sources, word counts, and pron stats. + counts = ReadWordCounts(args.word_counts_file_handle) + ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts) + g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts) + pd_lexicon = ReadLexicon(args, args.pd_lexicon_handle, counts) + stats, stats_summed = ReadArcStats(args.arc_stats_file_handle) + learned_lexicon = ReadLexicon(args, args.learned_lexicon_handle, counts) + + # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov. + WriteLearnedLexiconOov(learned_lexicon, ref_lexicon, args.learned_lexicon_oov_handle) + # Edits will be printed into ref_lexicon_edits, and the summary will be printed into stderr. + WriteEditsAndSummary(args, learned_lexicon, ref_lexicon, pd_lexicon, g2p_lexicon, counts, stats, stats_summed) + +if __name__ == "__main__": + Main() diff --git a/egs/wsj/s5/steps/dict/prons_to_lexicon.py b/egs/wsj/s5/steps/dict/prons_to_lexicon.py index 2a87d172602..a957b02d3d3 100755 --- a/egs/wsj/s5/steps/dict/prons_to_lexicon.py +++ b/egs/wsj/s5/steps/dict/prons_to_lexicon.py @@ -6,6 +6,7 @@ # we're using python 3.x style print but want it to work in python 2.x, from __future__ import print_function +from collections import defaultdict import argparse import sys @@ -21,8 +22,8 @@ def __call__(self, parser, namespace, values, option_string=None): raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) def GetArgs(): - parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phone level decoding) " - "into a lexicon for lexicon learning. We prune the pronunciations " + parser = argparse.ArgumentParser(description = "Converts pronunciation statistics (from phonetic decoding or g2p) " + "into a lexicon for. 
We prune the pronunciations " "based on a provided stats file, and optionally filter out entries which are present " "in a filter lexicon.", epilog = "e.g. steps/dict/prons_to_lexicon.py --min-prob=0.4 \\" @@ -39,6 +40,8 @@ def GetArgs(): action = StrToBoolAction, choices = ["true", "false"], help = "If normalize lexicon such that the max " "probability is 1.") + parser.add_argument("--top-N", type = int, default = 0, + help = "If non-zero, we just take the top N pronunciations (according to stats/pron-probs) for each word.") parser.add_argument("--min-prob", type = float, default = 0.1, help = "Remove pronunciation with probabilities less " "than this value after normalization.") @@ -46,8 +49,7 @@ def GetArgs(): help = "Exclude entries in this filter lexicon from the output lexicon." "each line must be ") parser.add_argument("stats_file", metavar='', type = str, - help = "Input file containing pronunciation statistics, representing how many times " - "each word-pronunciation appear in the phonetic decoding results." + help = "Input lexicon file containing pronunciation statistics/probs in the first column." 
"each line must be ") parser.add_argument("out_lexicon", metavar='', type = str, help = "Output lexicon.") @@ -150,6 +152,18 @@ def NormalizeLexicon(lexicon, set_max_to_one = True, prob = 0 lexicon[entry] = prob +def TakeTopN(lexicon, top_N): + lexicon_reshaped = defaultdict(list) + lexicon_pruned = {} + for entry, prob in lexicon.iteritems(): + lexicon_reshaped[entry[0]].append([entry[1], prob]) + for word in lexicon_reshaped: + prons = lexicon_reshaped[word] + sorted_prons = sorted(prons, reverse=True, key=lambda prons: prons[1]) + for i in range(len(sorted_prons)): + if i >= top_N: + lexicon[(word, sorted_prons[i][0])] = 0 + def WriteLexicon(args, lexicon, filter_lexicon): words = set() num_removed = 0 @@ -179,10 +193,15 @@ def Main(): word_probs = ConvertWordCountsToProbs(args, lexicon, word_count) lexicon = ConvertWordProbsToLexicon(word_probs) - filter_lexicon = ReadLexicon(args.filter_lexicon_handle) - NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one, - set_sum_to_one = args.set_sum_to_one, - min_prob = args.min_prob) + filter_lexicon = set() + if args.filter_lexicon is not '': + filter_lexicon = ReadLexicon(args.filter_lexicon_handle) + if args.top_N > 0: + TakeTopN(lexicon, args.top_N) + else: + NormalizeLexicon(lexicon, set_max_to_one = args.set_max_to_one, + set_sum_to_one = args.set_sum_to_one, + min_prob = args.min_prob) WriteLexicon(args, lexicon, filter_lexicon) args.out_lexicon_handle.close() diff --git a/egs/wsj/s5/steps/dict/select_prons_greedy.py b/egs/wsj/s5/steps/dict/select_prons_greedy.py new file mode 100755 index 00000000000..cf71070e134 --- /dev/null +++ b/egs/wsj/s5/steps/dict/select_prons_greedy.py @@ -0,0 +1,376 @@ +#!/usr/bin/env python + +# Copyright 2018 Xiaohui Zhang +# Apache 2.0. 
+
+from __future__ import print_function
+from collections import defaultdict
+import argparse
+import sys
+import math
+
+def GetArgs():
+    parser = argparse.ArgumentParser(
+        description = "Use a greedy framework to select pronunciation candidates"
+        "from three sources: a reference lexicon, G2P lexicon and phonetic-decoding"
+        "(PD) lexicon. Basically, this script implements the Alg. 1 in the paper:"
+        "Acoustic data-driven lexicon learning based on a greedy pronunciation "
+        "selection framework, by X. Zhang, V. Mahonar, D. Povey and S. Khudanpur,"
+        "Interspeech 2017. The inputs are an arc-stats file, containing "
+        "acoustic evidence (tau_{uwb} in the paper) and three source lexicons "
+        "(phonetic-decoding(PD)/G2P/ref). The outputs is the learned lexicon for"
+        "all words in the arc_stats (acoustic evidence) file.",
+        epilog = "See steps/dict/learn_lexicon_greedy.sh for example.")
+    parser.add_argument("--alpha", type = str, default = "0,0,0",
+                        help = "Scaling factors for the likelihood reduction threshold."
+                        "of three pronunciaiton candidate sources: phonetic-decoding (PD),"
+                        "G2P and reference. The valid range of each dimension is [0, 1], and"
+                        "a large value means we prune pronunciations from this source more"
+                        "aggressively. Setting a dimension to zero means we never want to remove"
+                        "pronunciaiton from that source. See Section 4.3 in the paper for details.")
+    parser.add_argument("--beta", type = str, default = "0,0,0",
+                        help = "smoothing factors for the likelihood reduction term."
+                        "of three pronunciaiton candidate sources: phonetic-decoding (PD),"
+                        "G2P and reference. The valid range of each dimension is [0, 100], and"
+                        "a large value means we prune pronunciations from this source more"
+                        "aggressively. See Section 4.3 in the paper for details.")
+    parser.add_argument("--delta", type = float, default = 0.000000001,
+                        help = "Floor value of the pronunciation posterior statistics."
+                        "The valid range is (0, 0.01),"
+                        "See Section 3 in the paper for details.")
+    parser.add_argument("silence_phones_file", metavar = "", type = str,
+                        help = "File containing a list of silence phones.")
+    parser.add_argument("arc_stats_file", metavar = "", type = str,
+                        help = "File containing word-pronunciation statistics obtained from lattices; "
+                        "each line must be ")
+    parser.add_argument("word_counts_file", metavar = "", type = str,
+                        help = "File containing word counts in acoustic training data; "
+                        "each line must be .")
+    parser.add_argument("ref_lexicon", metavar = "", type = str,
+                        help = "The reference lexicon (most probably hand-derived)."
+                        "Each line must be ")
+    parser.add_argument("g2p_lexicon", metavar = "", type = str,
+                        help = "Candidate ronouciations from G2P results."
+                        "Each line must be ")
+    parser.add_argument("pd_lexicon", metavar = "", type = str,
+                        help = "Candidate ronouciations from phonetic decoding results."
+                        "Each line must be ")
+    parser.add_argument("learned_lexicon", metavar = "", type = str,
+                        help = "Learned lexicon.")
+
+
+    print (' '.join(sys.argv), file=sys.stderr)
+
+    args = parser.parse_args()
+    args = CheckArgs(args)
+
+    return args
+
+def CheckArgs(args):
+    # Open all input/output file handles and validate the numeric options.
+    args.silence_phones_file_handle = open(args.silence_phones_file)
+    if args.arc_stats_file == "-":
+        args.arc_stats_file_handle = sys.stdin
+    else:
+        args.arc_stats_file_handle = open(args.arc_stats_file)
+    args.word_counts_file_handle = open(args.word_counts_file)
+    args.ref_lexicon_handle = open(args.ref_lexicon)
+    args.g2p_lexicon_handle = open(args.g2p_lexicon)
+    args.pd_lexicon_handle = open(args.pd_lexicon)
+    args.learned_lexicon_handle = open(args.learned_lexicon, "w")
+
+    alpha = args.alpha.strip().split(',')
+    if len(alpha) != 3:
+        raise Exception('Invalid alpha ', args.alpha)
+    for i in range(0,3):
+        if float(alpha[i]) < 0 or float(alpha[i]) > 1:
+            raise Exception('alpha ', alpha[i],
+                            ' is invalid, it must be within [0, 1].')
+        if float(alpha[i]) == 0:
+            alpha[i] = -1e-3
+    # The absolute likelihood loss (search for loss_abs) is supposed to be positive.
+    # But it could be negative near zero because of numerical precision limit.
+    # In this case, even if alpha is set to be zero, which means we never want to
+    # remove pronunciation from that source, the quality score (search for q_b)
+    # could still be negative, which means this pron could be potentially removed.
+    # To prevent this, we set alpha as a negative value near zero to ensure
+    # q_b is always positive.
+
+    args.alpha = [float(alpha[0]), float(alpha[1]), float(alpha[2])]
+    print("[alpha_{pd}, alpha_{g2p}, alpha_{ref}] is: ", args.alpha)
+
+    beta = args.beta.strip().split(',')
+    if len(beta) != 3:
+        raise Exception('Invalid beta ', args.beta)
+    for i in range(0,3):
+        if float(beta[i]) < 0 or float(beta[i]) > 100:
+            raise Exception('beta ', beta[i],
+                            ' is invalid, it must be within [0, 100].')
+    args.beta = [float(beta[0]), float(beta[1]), float(beta[2])]
+    print("[beta_{pd}, beta_{g2p}, beta_{ref}] is: ", args.beta)
+
+    if args.delta <= 0 or args.delta > 0.1:
+        raise Exception('delta ', args.delta, ' is invalid, it must be within'
+                        '(0, 0.01).')
+    print("delta is: ", args.delta)
+
+    return args
+
+def ReadArcStats(arc_stats_file_handle):
+    # Accumulate per-(word, utterance, start-frame) soft counts of each pron,
+    # plus per-(word, pron) totals summed over all occurrences.
+    stats = defaultdict(lambda : defaultdict(dict))
+    stats_summed = defaultdict(float)
+    for line in arc_stats_file_handle.readlines():
+        splits = line.strip().split()
+
+        if (len(splits) == 0):
+            continue
+
+        if (len(splits) < 5):
+            raise Exception('Invalid format of line ' + line
+                            + ' in arc stats file.')
+        utt = splits[1]
+        start_frame = int(splits[2])
+        word = splits[0]
+        count = float(splits[3])
+        phones = splits[4:]
+        phones = ' '.join(phones)
+        stats[word][(utt, start_frame)][phones] = count
+        stats_summed[(word, phones)] += count
+    return stats, stats_summed
+
+def ReadWordCounts(word_counts_file_handle):
+    # Map each word to its count in the acoustic training data.
+    counts = {}
+    for line in word_counts_file_handle.readlines():
+        splits = line.strip().split()
+        if len(splits) < 2:
+            raise Exception('Invalid format of line ' + line
+                            + ' in counts file.')
+        word = splits[0]
+        count = int(splits[1])
+        counts[word] = count
+    return counts
+
+def ReadLexicon(args, lexicon_file_handle, counts):
+    # we're skipping any word not in counts (not seen in training data),
+    # cause we're only learning prons for words who have acoustic examples.
+    lexicon = defaultdict(set)
+    for line in lexicon_file_handle.readlines():
+        splits = line.strip().split()
+        if len(splits) == 0:
+            continue
+        if len(splits) < 2:
+            raise Exception('Invalid format of line ' + line
+                            + ' in lexicon file.')
+        word = splits[0]
+        if word not in counts:
+            continue
+        phones = ' '.join(splits[1:])
+        lexicon[word].add(phones)
+    return lexicon
+
+def FilterPhoneticDecodingLexicon(args, pd_lexicon):
+    # We want to remove all candidates which contain silence phones
+    silphones = set()
+    for line in args.silence_phones_file_handle:
+        silphones.add(line.strip())
+    rejected_candidates = set()
+    for word, prons in pd_lexicon.items():
+        for pron in prons:
+            for phone in pron.split():
+                if phone in silphones:
+                    rejected_candidates.add((word, pron))
+                    break
+    for word, pron in rejected_candidates:
+        pd_lexicon[word].remove(pron)
+    return pd_lexicon
+
+# One iteration of Expectation-Maximization computation (Eq. 3-4 in the paper).
+def OneEMIter(args, word, stats, prons, pron_probs, debug=False):
+    # Returns (updated pron_probs, avg. per-example log-likelihood) for one EM step.
+    prob_acc = [0.0 for i in range(len(prons[word]))]
+    s = sum(pron_probs)
+    for i in range(len(pron_probs)):
+        pron_probs[i] = pron_probs[i] / s
+    log_like = 0.0
+    for (utt, start_frame) in stats[word]:
+        prob = []
+        soft_counts = []
+        for i in range(len(prons[word])):
+            phones = prons[word][i]
+            soft_count = stats[word][(utt, start_frame)].get(phones, 0)
+            if soft_count < args.delta:
+                soft_count = args.delta
+            soft_counts.append(soft_count)
+        prob = [i[0] * i[1] for i in zip(soft_counts, pron_probs)]
+        for i in range(len(prons[word])):
+            prob_acc[i] += prob[i] / sum(prob)
+        log_like += math.log(sum(prob))
+    pron_probs = [1.0 / float(len(stats[word])) * p for p in prob_acc]
+    log_like = 1.0 / float(len(stats[word])) * log_like
+    if debug:
+        print("Log_like of the word: ", log_like, "pron probs: ", pron_probs)
+    return pron_probs, log_like
+
+def SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon, dianostic_info=False):
+    prons = defaultdict(list) # Put all possible prons from three source lexicons into this dictionary
+    src = {} # Source of each (word, pron) pair: 'P' = phonetic-decoding, 'G' = G2P, 'R' = reference
+    learned_lexicon = defaultdict(set) # Put all selected prons in this dictionary
+    for lexicon in ref_lexicon, g2p_lexicon, pd_lexicon:
+        for word in lexicon:
+            for pron in lexicon[word]:
+                prons[word].append(pron)
+    for word in prons:
+        for pron in prons[word]:
+            if word in pd_lexicon and pron in pd_lexicon[word]:
+                src[(word, pron)] = 'P'
+            if word in g2p_lexicon and pron in g2p_lexicon[word]:
+                src[(word, pron)] = 'G'
+            if word in ref_lexicon and pron in ref_lexicon[word]:
+                src[(word, pron)] = 'R'
+
+    for word in prons:
+        if word not in stats:
+            continue
+        n = len(prons[word])
+        pron_probs = [1/float(n) for i in range(n)]
+        if dianostic_info:
+            print("pronunciations of word '{}': {}".format(word, prons[word]))
+        active_indexes = set(range(len(prons[word])))
+
+        deleted_prons = [] # indexes of prons to be deleted
+        soft_counts_normalized = []
+        while len(active_indexes) > 1:
+            log_like = 1.0
+            log_like_last = -1.0
+            num_iters = 0
+            while abs(log_like - log_like_last) > 1e-7:
+                num_iters += 1
+                log_like_last = log_like
+                pron_probs, log_like = OneEMIter(args, word, stats, prons, pron_probs, False)
+                if log_like_last == 1.0 and len(soft_counts_normalized) == 0: # the first iteration
+                    soft_counts_normalized = pron_probs
+                    if dianostic_info:
+                        print("Avg.(over all egs) soft counts: {}".format(soft_counts_normalized))
+            if dianostic_info:
+                print("\n Log_like after {} iters of EM: {}, estimated pron_probs: {} \n".format(
+                    num_iters, log_like, pron_probs))
+            candidates_to_delete = []
+
+            for i in active_indexes:
+                pron_probs_mod = [p for p in pron_probs]
+                pron_probs_mod[i] = 0.0
+                for j in range(len(pron_probs_mod)):
+                    if j in active_indexes and j != i:
+                        pron_probs_mod[j] += 0.01
+                pron_probs_mod = [s / sum(pron_probs_mod) for s in pron_probs_mod]
+                log_like2 = 1.0
+                log_like2_last = -1.0
+                num_iters2 = 0
+                # Running EM until convergence
+                while abs(log_like2 - log_like2_last) > 0.001 :
+                    num_iters2 += 1
+                    log_like2_last = log_like2
+                    pron_probs_mod, log_like2 = OneEMIter(args, word, stats,
+                                                          prons, pron_probs_mod, False)
+
+                loss_abs = log_like - log_like2 # absolute likelihood loss before normalization
+                # (supposed to be positive, but could be negative near zero because of numerical precision limit).
+                log_delta = math.log(args.delta)
+                thr = -log_delta
+                loss = loss_abs
+                source = src[(word, prons[word][i])]
+                if dianostic_info:
+                    print("\n set the pron_prob of '{}' whose source is {}, to zero results in {}"
+                          " loss in avg. log-likelihood; Num. iters until converging:{}. ".format(
+                        prons[word][i], source, loss, num_iters2))
+                # Compute quality score q_b = loss_abs * M_w / (M_w + beta_s(b)) + alpha_s(b) * log_delta
+                # See Sec. 4.3 and Alg. 1 in the paper.
+                if source == 'P':
+                    thr *= args.alpha[0]
+                    loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[0])
+                if source == 'G':
+                    thr *= args.alpha[1]
+                    loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[1])
+                if source == 'R':
+                    thr *= args.alpha[2]
+                    loss *= float(len(stats[word])) / (float(len(stats[word])) + args.beta[2])
+                if loss - thr < 0: # loss - thr here is just q_b
+                    if dianostic_info:
+                        print("Smoothed log-like loss {} is smaller than threshold {} so that the quality"
+                              "score {} is negative, adding the pron to the list of candidates to delete"
+                              ". ".format(loss, thr, loss-thr))
+                    candidates_to_delete.append((loss-thr, i))
+            if len(candidates_to_delete) == 0:
+                break
+            candidates_to_delete_sorted = sorted(candidates_to_delete,
+                key=lambda candidates_to_delete: candidates_to_delete[0])
+
+            deleted_candidate = candidates_to_delete_sorted[0]
+            active_indexes.remove(deleted_candidate[1])
+            pron_probs[deleted_candidate[1]] = 0.0
+            for i in range(len(pron_probs)):
+                if i in active_indexes:
+                    pron_probs[i] += 0.01
+            pron_probs = [s / sum(pron_probs) for s in pron_probs]
+            source = src[(word, prons[word][deleted_candidate[1]])]
+            pron = prons[word][deleted_candidate[1]]
+            soft_count = soft_counts_normalized[deleted_candidate[1]]
+            quality_score = deleted_candidate[0]
+            # This part of diagnostic info provides hints to the user on how to adjust the parameters.
+            if dianostic_info:
+                print("removed pron {}, from source {} with quality score {:.5f}".format(
+                    pron, source, quality_score))
+            if (source == 'P' and soft_count > 0.7 and len(stats[word]) > 5):
+                print("WARNING: alpha_{{pd}} or beta_{{pd}} may be too large!"
+                      " For the word '{}' whose count is {}, the candidate "
+                      " pronunciation from phonetic decoding '{}' with normalized "
+                      " soft count {} (out of 1) is rejected. It shouldn't have been"
+                      " rejected if alpha_{{pd}} is smaller than {}".format(
+                          word, len(stats[word]), pron, soft_count,
+                          -loss / log_delta),
+                      file=sys.stderr)
+                if loss_abs > thr:
+                    print(" or beta_{{pd}} is smaller than {}".format(
+                        (loss_abs / thr - 1) * len(stats[word])), file=sys.stderr)
+            if (source == 'G' and soft_count > 0.7 and len(stats[word]) > 5):
+                print("WARNING: alpha_{{g2p}} or beta_{{g2p}} may be too large!"
+                      " For the word '{}' whose count is {}, the candidate "
+                      " pronunciation from G2P '{}' with normalized "
+                      " soft count {} (out of 1) is rejected. It shouldn't have been"
+                      " rejected if alpha_{{g2p}} is smaller than {} ".format(
+                          word, len(stats[word]), pron, soft_count,
+                          -loss / log_delta),
+                      file=sys.stderr)
+                if loss_abs > thr:
+                    print(" or beta_{{g2p}} is smaller than {}.".format((
+                        loss_abs / thr - 1) * len(stats[word])), file=sys.stderr)
+            deleted_prons.append(deleted_candidate[1])
+        for i in range(len(prons[word])):
+            if i not in deleted_prons:
+                learned_lexicon[word].add(prons[word][i])
+
+    return learned_lexicon
+
+def WriteLearnedLexicon(learned_lexicon, file_handle):
+    # Print each selected (word, pron) pair and close the output handle.
+    for word, prons in learned_lexicon.items():
+        for pron in prons:
+            print('{0} {1}'.format(word, pron), file=file_handle)
+    file_handle.close()
+
+def Main():
+    args = GetArgs()
+
+    # Read in three lexicon sources, word counts, and pron stats.
+    counts = ReadWordCounts(args.word_counts_file_handle)
+    ref_lexicon = ReadLexicon(args, args.ref_lexicon_handle, counts)
+    g2p_lexicon = ReadLexicon(args, args.g2p_lexicon_handle, counts)
+    pd_lexicon = ReadLexicon(args, args.pd_lexicon_handle, counts)
+    stats, stats_summed = ReadArcStats(args.arc_stats_file_handle)
+    pd_lexicon = FilterPhoneticDecodingLexicon(args, pd_lexicon)
+
+    # Select prons to construct the learned lexicon.
+ learned_lexicon = SelectPronsGreedy(args, stats, counts, ref_lexicon, g2p_lexicon, pd_lexicon) + + # Write the learned prons for words out of the ref. vocab into learned_lexicon_oov. + WriteLearnedLexicon(learned_lexicon, args.learned_lexicon_handle) + +if __name__ == "__main__": + Main() From ae555cf795004402b540ad876b74a2eb09459358 Mon Sep 17 00:00:00 2001 From: xiaohui-zhang Date: Thu, 25 Oct 2018 11:14:15 -0400 Subject: [PATCH 2/4] added memory compression option for wsj tdnn recipe --- egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh index 1724c057e12..526059b7b90 100755 --- a/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh +++ b/egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh @@ -220,6 +220,7 @@ if [ $stage -le 16 ]; then --chain.apply-deriv-weights=false \ --chain.lm-opts="--num-extra-lm-states=2000" \ --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=10 \ From 7a74551eb26a5cde5cc5586a92833be8a59d0087 Mon Sep 17 00:00:00 2001 From: xiaohui-zhang Date: Thu, 25 Oct 2018 11:29:37 -0400 Subject: [PATCH 3/4] rename the old lexicon learning recipe steps/dict/learn_lexicon.sh to steps/dict/learn_lexicon_bayesian.sh --- .../local/{run_learn_lex.sh => run_learn_lex_bayesian.sh} | 4 ++-- egs/wsj/s5/steps/dict/apply_lexicon_edits.py | 2 +- egs/wsj/s5/steps/dict/get_pron_stats.py | 2 +- egs/wsj/s5/steps/dict/internal/sum_arc_info.py | 2 +- .../dict/{learn_lexicon.sh => learn_lexicon_bayesian.sh} | 0 egs/wsj/s5/steps/dict/prons_to_lexicon.py | 2 +- egs/wsj/s5/steps/dict/prune_pron_candidates.py | 2 +- egs/wsj/s5/steps/dict/select_prons_bayesian.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) rename egs/tedlium/s5_r2/local/{run_learn_lex.sh => run_learn_lex_bayesian.sh} (98%) rename 
egs/wsj/s5/steps/dict/{learn_lexicon.sh => learn_lexicon_bayesian.sh} (100%) diff --git a/egs/tedlium/s5_r2/local/run_learn_lex.sh b/egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh similarity index 98% rename from egs/tedlium/s5_r2/local/run_learn_lex.sh rename to egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh index a2a6f2e46b8..f1497bfe202 100755 --- a/egs/tedlium/s5_r2/local/run_learn_lex.sh +++ b/egs/tedlium/s5_r2/local/run_learn_lex_bayesian.sh @@ -2,7 +2,7 @@ # # This script demonstrates a lexicon learning recipe, which aims to imrove # the pronounciation of abbreviated words in the TED-LIUM lexicon. It assumes -# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon.sh +# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_bayesian.sh # for explanation of the options. # # Copyright 2016 Xiaohui Zhang @@ -78,7 +78,7 @@ fi # Learn a lexicon based on the acoustic training data and the reference lexicon. if [ $stage -le 1 ]; then - steps/dict/learn_lexicon.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \ + steps/dict/learn_lexicon_bayesian.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \ --min-prob $min_prob --variants-prob-mass $variants_prob_mass \ --variants-prob-mass-ref $variants_prob_mass_ref \ --prior-counts-tot $prior_counts_tot --prior-mean $prior_mean \ diff --git a/egs/wsj/s5/steps/dict/apply_lexicon_edits.py b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py index a5bdbc30d46..f8568971fb7 100755 --- a/egs/wsj/s5/steps/dict/apply_lexicon_edits.py +++ b/egs/wsj/s5/steps/dict/apply_lexicon_edits.py @@ -10,7 +10,7 @@ def GetArgs(): parser = argparse.ArgumentParser(description = "Apply an lexicon edits file (output from steps/dict/select_prons_bayesian.py)to an input lexicon" "to produce a learned lexicon.", - epilog = "See steps/dict/learn_lexicon.sh for example") + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") parser.add_argument("in_lexicon", metavar='', type = str, help = "Input lexicon. 
Each line must be .") diff --git a/egs/wsj/s5/steps/dict/get_pron_stats.py b/egs/wsj/s5/steps/dict/get_pron_stats.py index 41866294723..f6ce8e49807 100755 --- a/egs/wsj/s5/steps/dict/get_pron_stats.py +++ b/egs/wsj/s5/steps/dict/get_pron_stats.py @@ -19,7 +19,7 @@ def GetArgs(): epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|" " steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\" " exp/tri3_lex_0.4_work/lats/pron_stats.txt" - "See steps/dict/learn_lexicon.sh for examples in detail.") + "See steps/dict/learn_lexicon_greedy.sh for examples in detail.") parser.add_argument("arc_info_file", metavar = "", type = str, help = "Input file containing per arc statistics; " diff --git a/egs/wsj/s5/steps/dict/internal/sum_arc_info.py b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py index d3913ec954f..5f02bc5fc29 100755 --- a/egs/wsj/s5/steps/dict/internal/sum_arc_info.py +++ b/egs/wsj/s5/steps/dict/internal/sum_arc_info.py @@ -24,7 +24,7 @@ def GetArgs(): parser = argparse.ArgumentParser( description = "Accumulate statistics from per arc lattice statitics" "for lexicon learning", - epilog = "See steps/dict/learn_lexicon.sh for example") + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") parser.add_argument("--set-sum-to-one", type = str, default = True, action = StrToBoolAction, choices = ["true", "false"], diff --git a/egs/wsj/s5/steps/dict/learn_lexicon.sh b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh similarity index 100% rename from egs/wsj/s5/steps/dict/learn_lexicon.sh rename to egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh diff --git a/egs/wsj/s5/steps/dict/prons_to_lexicon.py b/egs/wsj/s5/steps/dict/prons_to_lexicon.py index a957b02d3d3..37d7810411b 100755 --- a/egs/wsj/s5/steps/dict/prons_to_lexicon.py +++ b/egs/wsj/s5/steps/dict/prons_to_lexicon.py @@ -30,7 +30,7 @@ def GetArgs(): "--filter-lexicon=exp/tri3_lex_0.4_work/phone_decode/filter_lexicon.txt \\" "exp/tri3_lex_0.4_work/phone_decode/prons.txt 
\\" "exp/tri3_lex_0.4_work/lexicon_phone_decoding.txt" - "See steps/dict/learn_lexicon.sh for examples in detail.") + "See steps/dict/learn_lexicon_greedy.sh for examples in detail.") parser.add_argument("--set-sum-to-one", type = str, default = False, action = StrToBoolAction, choices = ["true", "false"], diff --git a/egs/wsj/s5/steps/dict/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/prune_pron_candidates.py index affc5b17705..0f64f38b785 100755 --- a/egs/wsj/s5/steps/dict/prune_pron_candidates.py +++ b/egs/wsj/s5/steps/dict/prune_pron_candidates.py @@ -16,7 +16,7 @@ def GetArgs(): "(For words in the reference lexicon, N = # pron variants given by the reference" "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." "r is a user-specified constant, like 2.", - epilog = "See steps/dict/learn_lexicon.sh for example") + epilog = "See steps/dict/learn_lexicon_geedy.sh for example") parser.add_argument("--r", type = float, default = "2.0", help = "a user-specified ratio parameter which determines how many" diff --git a/egs/wsj/s5/steps/dict/select_prons_bayesian.py b/egs/wsj/s5/steps/dict/select_prons_bayesian.py index e728a4af0b8..4ccca302ebf 100755 --- a/egs/wsj/s5/steps/dict/select_prons_bayesian.py +++ b/egs/wsj/s5/steps/dict/select_prons_bayesian.py @@ -23,7 +23,7 @@ def GetArgs(): "a learned lexicon for words out of the ref. vocab (learned_lexicon_oov)," "and a lexicon_edits file containing suggested modifications of prons, for" "words within the ref. vocab (ref_lexicon_edits).", - epilog = "See steps/dict/learn_lexicon.sh for example.") + epilog = "See steps/dict/learn_lexicon_bayesian.sh for example.") parser.add_argument("--prior-mean", type = str, default = "0,0,0", help = "Mean of priors (summing up to 1) assigned to three exclusive n" "pronunciatio sources: reference lexicon, g2p, and phonetic decoding. 
We " From b147dc87f7665099f87fef4f03a3a3dd0d282893 Mon Sep 17 00:00:00 2001 From: xiaohui-zhang Date: Fri, 14 Dec 2018 01:25:39 -0500 Subject: [PATCH 4/4] fixed typos --- egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh | 2 +- egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh | 2 +- egs/wsj/s5/steps/dict/prune_pron_candidates.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh index ae9681ebab3..042f8f94da4 100755 --- a/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh +++ b/egs/wsj/s5/steps/dict/learn_lexicon_bayesian.sh @@ -28,7 +28,7 @@ # Begin configuration section. -cmd=queue.pl +cmd=run.pl nj=4 stage=0 diff --git a/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh index 83aa98c1700..56e85f20d62 100755 --- a/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh +++ b/egs/wsj/s5/steps/dict/learn_lexicon_greedy.sh @@ -40,7 +40,7 @@ stage=0 # Begin configuration section. -cmd=queue.pl +cmd=run.pl nj= stage=0 oov_symbol= diff --git a/egs/wsj/s5/steps/dict/prune_pron_candidates.py b/egs/wsj/s5/steps/dict/prune_pron_candidates.py index 0f64f38b785..e32478cecea 100755 --- a/egs/wsj/s5/steps/dict/prune_pron_candidates.py +++ b/egs/wsj/s5/steps/dict/prune_pron_candidates.py @@ -16,7 +16,7 @@ def GetArgs(): "(For words in the reference lexicon, N = # pron variants given by the reference" "lexicon; For oov words, N = avg. # pron variants per word in the reference lexicon)." "r is a user-specified constant, like 2.", - epilog = "See steps/dict/learn_lexicon_geedy.sh for example") + epilog = "See steps/dict/learn_lexicon_greedy.sh for example") parser.add_argument("--r", type = float, default = "2.0", help = "a user-specified ratio parameter which determines how many"