Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# This script demonstrates a lexicon learning recipe, which aims to improve
# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon.sh
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_bayesian.sh
# for explanation of the options.
#
# Copyright 2016 Xiaohui Zhang
Expand Down Expand Up @@ -78,7 +78,7 @@ fi

# Learn a lexicon based on the acoustic training data and the reference lexicon.
if [ $stage -le 1 ]; then
steps/dict/learn_lexicon.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
steps/dict/learn_lexicon_bayesian.sh --lexicon-g2p "$data/lexicon_oov_g2p.txt" \
--min-prob $min_prob --variants-prob-mass $variants_prob_mass \
--variants-prob-mass-ref $variants_prob_mass_ref \
--prior-counts-tot $prior_counts_tot --prior-mean $prior_mean \
Expand Down
133 changes: 133 additions & 0 deletions egs/tedlium/s5_r2/local/run_learn_lex_greedy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/bin/bash
#
# This script demonstrates a lexicon learning recipe, which aims to improve
# the pronunciation of abbreviated words in the TED-LIUM lexicon. It assumes
# the model exp/tri3 already exists. Please see steps/dict/learn_lexicon_greedy.sh
# for explanation of the options.
#
# Copyright 2018 Xiaohui Zhang
# Apache 2.0

. ./cmd.sh
. ./path.sh

oov_symbol="<unk>"
# The user may have a Phonetisaurus-trained English g2p model ready.
g2p_mdl_dir=
# The dir which contains the reference lexicon (most probably hand-derived)
# we want to expand/improve, and nonsilence_phones.txt, etc. which we need
# for building new dict dirs.
ref_dict=data/local/dict
# Acoustic training data we use to get alternative
# pronunciations and collect acoustic evidence.
data=data/train
# The cut-off parameter used to select pronunciation candidates from phone
# decoding. We remove pronunciations with probabilities less than this value
# after normalizing the probs s.t. the max-prob is 1.0 for each word.
min_prob=0.1
# Refer to steps/dict/select_prons_greedy.sh for the detailed meaning of
# alpha, beta and delta. Basically, the three dimensions of alpha
# and beta correspond to three pronunciation sources: phonetic-
# decoding, G2P and the reference lexicon, and the larger a value is,
# the more aggressively we'll prune pronunciations from that source.
# The valid range of each dim. is [0, 1] for alpha (0 means we never
# prune prons from that source) and [0, 100] for beta.
alpha="0.04,0.02,0"
beta="30,5,0"
# Floor value of the pronunciation posterior statistics.
delta=0.00000001
# This parameter determines how many pronunciations we keep for each word
# after the first pass pruning. See steps/dict/internal/prune_pron_candidates.py
# for details.
vcr=16

# Intermediate outputs of the lexicon learning stage will be put into dir.
dir=exp/tri3_lex_greedy_work
nj=35
decode_nj=30
stage=0
lexlearn_stage=0
affix="learned_greedy"

. utils/parse_options.sh # accept options

# The reference vocab is the list of words for which we already have
# hand-derived pronunciations.
ref_vocab=data/local/vocab.txt
cat $ref_dict/lexicon.txt | awk '{print $1}' | sort | uniq > $ref_vocab || exit 1;

# Get a G2P generated lexicon for oov words (w.r.t. the reference lexicon)
# in acoustic training data.
if [ $stage -le 0 ]; then
  # Quoted: an empty value must not make the test vanish into '[ -z ]'.
  if [ -z "$g2p_mdl_dir" ]; then
    g2p_mdl_dir=exp/g2p_phonetisaurus
    steps/dict/train_g2p_phonetisaurus.sh $ref_dict/lexicon.txt $g2p_mdl_dir || exit 1;
  fi
  # All words seen in the training text (fields 2..NF of each line).
  awk '{for (n=2;n<=NF;n++) vocab[$n]=1;} END{for (w in vocab) printf "%s\n",w;}' \
    $data/text | sort -u > $data/train_vocab.txt || exit 1;
  # Training words absent from the reference vocab.
  awk 'NR==FNR{a[$1] = 1; next} {if(!($1 in a)) print $1}' $ref_vocab \
    $data/train_vocab.txt | sort > $data/oov_train.txt || exit 1;
  steps/dict/apply_g2p_phonetisaurus.sh --nbest 5 $data/train_vocab.txt $g2p_mdl_dir \
    exp/g2p_phonetisaurus/lex_train || exit 1;
fi

# Learn a lexicon based on the acoustic training data and the reference lexicon.
if [ $stage -le 1 ]; then
  steps/dict/learn_lexicon_greedy.sh --lexiconp-g2p "exp/g2p_phonetisaurus/lex_train/lexicon.lex" \
    --alpha $alpha --beta $beta --delta $delta \
    --min-prob $min_prob --cmd "$train_cmd" \
    --variant-counts-ratio $vcr \
    --stage $lexlearn_stage --nj 60 --oov-symbol $oov_symbol --retrain-src-mdl false \
    $ref_dict $ref_vocab $data exp/tri3 data/lang data/local/dict_${affix}_nosp \
    $dir || exit 1;
fi

# Add pronunciation probs to the learned lexicon.
if [ $stage -le 2 ]; then
  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
    data/local/dict_${affix}_nosp $oov_symbol data/local/lang_${affix}_nosp data/lang_${affix}_nosp || exit 1;

  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
    $data data/lang_${affix}_nosp exp/tri2 exp/tri2_ali_${affix}_nosp || exit 1;

  steps/get_prons.sh --cmd "$train_cmd" data/train data/lang_${affix}_nosp exp/tri2_ali_${affix}_nosp || exit 1;

  utils/dict_dir_add_pronprobs.sh --max-normalize true \
    data/local/dict_${affix}_nosp exp/tri2_ali_${affix}_nosp/pron_counts_nowb.txt \
    exp/tri2_ali_${affix}_nosp/sil_counts_nowb.txt \
    exp/tri2_ali_${affix}_nosp/pron_bigram_counts_nowb.txt data/local/dict_${affix} || exit 1;

  utils/prepare_lang.sh --phone-symbol-table data/lang/phones.txt \
    data/local/dict_${affix} $oov_symbol data/local/lang_${affix} data/lang_${affix} || exit 1;
fi

# Re-decode
if [ $stage -le 3 ]; then
  ! cmp data/lang_nosp/words.txt data/lang_${affix}/words.txt &&\
    echo "$0: The vocab of the affix lexicon and the reference vocab may be incompatible."
  cp data/lang_nosp/G.fst data/lang_${affix}/
  utils/mkgraph.sh data/lang_${affix} exp/tri3 exp/tri3/graph_${affix} || exit 1;

  for dset in dev test; do
    ( steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \
        exp/tri3/graph_${affix} data/${dset} exp/tri3/decode_${affix}_${dset} || exit 1;
    ) &
  done
fi

# RESULTS:
# Baseline:
# %WER 18.7 | 507 17783 | 83.9 11.4 4.7 2.6 18.7 92.3 | -0.006 | exp/tri3/decode_dev/score_17_0.0/ctm.filt.filt.sys
# %WER 17.6 | 1155 27500 | 84.7 11.6 3.7 2.4 17.6 87.2 | 0.013 | exp/tri3/decode_test/score_15_0.0/ctm.filt.filt.sys

# Re-decoding with the learned lexicon:
# %WER 18.5 | 507 17783 | 84.3 11.2 4.5 2.8 18.5 92.3 | -0.007 | exp/tri3/decode_learned_greedy_dev/score_16_0.0/ctm.filt.filt.sys
# %WER 17.5 | 1155 27500 | 84.9 11.5 3.6 2.4 17.5 87.5 | 0.035 | exp/tri3/decode_learned_greedy_test/score_14_0.0/ctm.filt.filt.sys

# To see the effect to neural-net results, one should re-train NN with the learned lexicon.
# Experiments have shown that, with the new lang dir, one should just re-run NN training
# starting from the supervision generation (steps/align_fmllr_lats.sh) stage, and should
# expect improved overall WERs and word recognition performance on words whose pronunciations
# were changed.

# BUGFIX: the original had 'exit' before 'wait', so the backgrounded decode
# jobs from stage 3 were never waited for. Wait first, then exit cleanly.
wait
exit 0
1 change: 1 addition & 0 deletions egs/wsj/s5/local/chain/tuning/run_tdnn_1g.sh
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ if [ $stage -le 16 ]; then
--chain.apply-deriv-weights=false \
--chain.lm-opts="--num-extra-lm-states=2000" \
--trainer.dropout-schedule $dropout_schedule \
--trainer.add-option="--optimization.memory-compression-level=2" \
--trainer.srand=$srand \
--trainer.max-param-change=2.0 \
--trainer.num-epochs=10 \
Expand Down
2 changes: 1 addition & 1 deletion egs/wsj/s5/steps/dict/apply_lexicon_edits.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
def GetArgs():
parser = argparse.ArgumentParser(description = "Apply an lexicon edits file (output from steps/dict/select_prons_bayesian.py)to an input lexicon"
"to produce a learned lexicon.",
epilog = "See steps/dict/learn_lexicon.sh for example")
epilog = "See steps/dict/learn_lexicon_greedy.sh for example")

parser.add_argument("in_lexicon", metavar='<in-lexicon>', type = str,
help = "Input lexicon. Each line must be <word> <phones>.")
Expand Down
19 changes: 10 additions & 9 deletions egs/wsj/s5/steps/dict/get_pron_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,16 @@
import sys

def GetArgs():
parser = argparse.ArgumentParser(description = "Accumulate statistics from lattice-alignment outputs for lexicon"
"learning. The inputs are a file containing arc level information from lattice-align-words,"
"and a map which maps word-position-dependent phones to word-position-independent phones"
"(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts"
"of pronunciations",
epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|"
" steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\"
" exp/tri3_lex_0.4_work/lats/pron_stats.txt"
"See steps/dict/learn_lexicon.sh for examples in detail.")
parser = argparse.ArgumentParser(
description = "Accumulate statistics from lattice-alignment outputs for lexicon"
"learning. The inputs are a file containing arc level information from lattice-align-words,"
"and a map which maps word-position-dependent phones to word-position-independent phones"
"(output from steps/cleanup/debug_lexicon.txt). The output contains accumulated soft-counts"
"of pronunciations",
epilog = "cat exp/tri3_lex_0.4_work/lats/arc_info_sym.*.txt \\|"
" steps/dict/get_pron_stats.py - exp/tri3_lex_0.4_work/phone_decode/phone_map.txt \\"
" exp/tri3_lex_0.4_work/lats/pron_stats.txt"
"See steps/dict/learn_lexicon_greedy.sh for examples in detail.")

parser.add_argument("arc_info_file", metavar = "<arc-info-file>", type = str,
help = "Input file containing per arc statistics; "
Expand Down
140 changes: 140 additions & 0 deletions egs/wsj/s5/steps/dict/internal/get_subsegments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#!/usr/bin/env python

# Copyright 2018 Xiaohui Zhang
# Apache 2.0.

# we're using python 3.x style print but want it to work in python 2.x,
from __future__ import print_function
import argparse
import sys
import string

def GetArgs():
    """Build the command-line parser, echo the invocation to stderr,
    and return the parsed arguments after validation by CheckArgs()."""
    # Log the exact command line for reproducibility (Kaldi convention).
    print(' '.join(sys.argv), file=sys.stderr)

    arg_parser = argparse.ArgumentParser(
        description = "The purpose of this script is to use a ctm and a vocab file"
        "to extract sub-utterances and a sub-segmentation. Extracted sub-utterances"
        "are all the strings of consecutive in-vocab words from the ctm"
        "surrounded by an out-of-vocab word at each end if present.",
        epilog = "e.g. steps/dict/internal/get_subsegments.py exp/tri3_lex_0.4_work/phonetic_decoding/word.ctm \\"
        "exp/tri3_lex_0.4_work/learn_vocab.txt exp/tri3_lex_0.4_work/resegmentation/subsegments \\"
        "exp/tri3_lex_0.4_work/resegmentation/text"
        "See steps/dict/learn_lexicon_greedy.sh for an example.")

    arg_parser.add_argument("ctm", metavar='<ctm>', type = str,
                            help = "Input ctm file."
                            "each line must be <utt-id> <chanel> <start-time> <duration> <word>")
    arg_parser.add_argument("vocab", metavar='<vocab>', type = str,
                            help = "Vocab file."
                            "each line must be <word>")
    arg_parser.add_argument("subsegment", metavar='<subsegtment>', type = str,
                            help = "Subsegment file. Each line is in format:"
                            "<new-utt> <old-utt> <start-time-within-old-utt> <end-time-within-old-utt>")
    arg_parser.add_argument("text", metavar='<text>', type = str,
                            help = "Text file. Each line is in format:"
                            " <new-utt> <word1> <word2> ... <wordN>.")

    return CheckArgs(arg_parser.parse_args())
return args

def CheckArgs(args):
    """Open the input/output files named in args and attach the handles
    (ctm_handle, vocab_handle, subsegment_handle, text_handle) to args.

    A filename of "-" means "read from stdin". Note that only one of
    ctm/vocab can usefully be "-" in a single invocation.
    Returns the same args namespace, augmented with the handles.
    """
    if args.ctm == "-":
        args.ctm_handle = sys.stdin
    else:
        args.ctm_handle = open(args.ctm)

    # BUGFIX: was 'args.vocab is not ""' (identity comparison with a literal,
    # only correct by accident of string interning) and mapped "-" to
    # sys.stdout — a write-only handle that ReadVocab() later tries to
    # .readlines() from. "-" must mean stdin for an *input* file.
    if args.vocab != '':
        if args.vocab == "-":
            args.vocab_handle = sys.stdin
        else:
            args.vocab_handle = open(args.vocab)

    # Outputs are always regular files opened for writing.
    args.subsegment_handle = open(args.subsegment, 'w')
    args.text_handle = open(args.text, 'w')

    return args

def GetSubsegments(args, vocab):
    """Scan the ctm (args.ctm_handle) line by line and cut each utterance
    into sub-utterances, writing one line per sub-utterance to
    args.text_handle (<sub-utt-id> <words...>) and to
    args.subsegment_handle (<sub-utt-id> <old-utt-id> <start> <end>).

    Each ctm line must be: <utt-id> <channel> <start> <duration> <word>.
    Words in 'vocab' (and '<eps>', i.e. silence) extend the current
    sub-utterance; any other word closes it (being appended as a boundary
    word) and starts a new one.
    NOTE(review): the 'is_oov' naming is inverted relative to the vocab
    membership test — presumably 'vocab' is the list of words whose
    pronunciations we want to learn; confirm against the caller.
    """
    sub_utt = list()          # words of the sub-utterance being built
    is_oov = False
    utt_id_last = None        # utt-id seen on the previous ctm line
    start_times = {}          # sub-utt-id -> start time within old utt
    end_times = {}            # sub-utt-id -> end time within old utt
    sub_utts = {}             # sub-utt-id -> (old utt-id, word list)
    sub_utt_id = 1            # index of current sub-utt within the utt
    sub_utt_id_last = 1
    end_time_last = 0.0       # end time of the previous ctm entry
    for line in args.ctm_handle:
        splits = line.strip().split()
        if len(splits) < 5:
            raise Exception("problematic line", line)

        utt_id = splits[0]
        start = float(splits[2])
        dur = float(splits[3])
        word = splits[4]
        if utt_id != utt_id_last:
            # Crossed an utterance boundary: flush the trailing sub-utt of
            # the previous utterance (only if it has more than one word).
            sub_utt_id = 1
            if len(sub_utt) > 1:
                sub_utts[utt_id_last+'-'+str(sub_utt_id_last)] = (utt_id_last, sub_utt)
                # BUGFIX: was 'ent_time_last', a typo for 'end_time_last';
                # the typo'd name is undefined until one full loop iteration
                # has run, and the 0.0 initialization above was dead.
                end_times[utt_id_last+'-'+str(sub_utt_id_last)] = end_time_last
            sub_utt = []
            start_times[utt_id+'-'+str(sub_utt_id)] = start
            is_oov_last = False
        if word == '<eps>':
            # Silence: keep the sub-utterance open, just extend its end time.
            is_oov = True
            end_times[utt_id+'-'+str(sub_utt_id)] = start + dur
        elif word in vocab:
            is_oov = True
            sub_utt.append(word)
            end_times[utt_id+'-'+str(sub_utt_id)] = start + dur
        else:
            # Boundary word: close the current sub-utt (including this word
            # as the right boundary) and open a new one starting with it.
            is_oov = False
            if is_oov_last == True:
                sub_utt.append(word)
                sub_utts[utt_id+'-'+str(sub_utt_id_last)] = (utt_id, sub_utt)
                end_times[utt_id+'-'+str(sub_utt_id_last)] = start + dur
            sub_utt_id += 1
            sub_utt = [word]
            start_times[utt_id+'-'+str(sub_utt_id)] = start
        utt_id_last = utt_id
        sub_utt_id_last = sub_utt_id
        is_oov_last = is_oov
        end_time_last = start + dur

    # Flush the final sub-utterance if the ctm ended inside one.
    # NOTE(review): a trailing in-vocab word was already appended in the
    # loop body, so this may append it twice — confirm intended behavior.
    if is_oov:
        if word != '<eps>':
            sub_utt.append(word)
        sub_utts[utt_id+'-'+str(sub_utt_id_last)] = (utt_id, sub_utt)
        end_times[utt_id+'-'+str(sub_utt_id_last)] = start + dur

    for utt, v in sorted(sub_utts.items()):
        print(utt, ' '.join(sub_utts[utt][1]), file=args.text_handle)
        print(utt, sub_utts[utt][0], start_times[utt], end_times[utt], file=args.subsegment_handle)

def ReadVocab(vocab_file_handle):
    """Read a vocab file (one word per line) from the given handle and
    return the words as a set. A falsy handle yields an empty set; a line
    with more than one field raises an Exception."""
    words = set()
    if not vocab_file_handle:
        return words
    for line in vocab_file_handle.readlines():
        fields = line.strip().split()
        if len(fields) == 0:
            continue          # skip blank lines
        if len(fields) > 1:
            raise Exception('Invalid format of line ' + line
                            + ' in vocab file.')
        words.add(fields[0])
    return words

def Main():
    """Entry point: parse/check arguments, load the vocab, then extract
    sub-segments and sub-utterance text from the ctm."""
    opts = GetArgs()
    GetSubsegments(opts, ReadVocab(opts.vocab_handle))


if __name__ == "__main__":
    Main()
Loading