Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 0 additions & 42 deletions egs/multi_en/s5/local/g2p/apply_g2p.sh

This file was deleted.

67 changes: 0 additions & 67 deletions egs/multi_en/s5/local/g2p/train_g2p.sh

This file was deleted.

27 changes: 23 additions & 4 deletions egs/multi_en/s5/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ if [ $stage -le 1 ]; then
# We prepare the basic dictionary in data/local/dict_combined.
local/prepare_dict.sh $swbd $tedlium2
(
local/g2p/train_g2p.sh --stage 0 --silence-phones \
"data/local/dict_combined/silence_phones.txt" data/local/dict_combined exp/g2p || touch exp/g2p/.error
steps/dict/train_g2p_phonetisaurus.sh --stage 0 --silence-phones \
"data/local/dict_combined/silence_phones.txt" data/local/dict_combined/lexicon.txt exp/g2p || touch exp/g2p/.error
) &
fi

Expand Down Expand Up @@ -114,8 +114,27 @@ if [ $stage -le 4 ]; then
mkdir -p $dict_dir
rm $dict_dir/lexiconp.txt 2>/dev/null || true
cp data/local/dict_combined/{extra_questions,nonsilence_phones,silence_phones,optional_silence}.txt $dict_dir
local/g2p/apply_g2p.sh --var-counts 1 exp/g2p/model.fst data/local/g2p_phonetisarus \
data/local/dict_combined/lexicon.txt $dict_dir/lexicon.txt || exit 1;

echo 'Gathering missing words...'

# Source lexicon and scratch directory for the G2P-based lexicon expansion.
lexicon=data/local/dict_combined/lexicon.txt
g2p_tmp_dir=data/local/g2p_phonetisarus
mkdir -p $g2p_tmp_dir

# Build the list of out-of-vocabulary words found in the training transcripts.
# local/count_oovs.pl output is post-processed by dropping the first three
# columns of every line (awk idiom from
# http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns),
# then putting one token per line and de-duplicating.
cat data/*/train/text \
  | local/count_oovs.pl $lexicon \
  | awk 'NF > 3 { for (i = 4; i < NF; i++) printf "%s ", $i; print $NF }' \
  | perl -ape 's/\s/\n/g;' \
  | sort \
  | uniq > $g2p_tmp_dir/missing.txt

# Keep only tokens made purely of lowercase ASCII letters.
grep "^[a-z]*$" < $g2p_tmp_dir/missing.txt > $g2p_tmp_dir/missing_onlywords.txt

# Synthesize a single pronunciation per remaining word with the trained model.
if ! steps/dict/apply_g2p_phonetisaurus.sh --nbest 1 exp/g2p/model.fst \
    $g2p_tmp_dir/missing_onlywords.txt $g2p_tmp_dir/missing_lexicon.txt; then
  exit 1
fi

# Merge the new entries (columns 1 and 3 of the G2P output) with the original
# lexicon, sorted and without duplicate lines.
extended_lexicon=$dict_dir/lexicon.txt
echo "Adding new pronunciations to get expanded lexicon $extended_lexicon"
{
  cut -f 1,3 $g2p_tmp_dir/missing_lexicon.txt
  cat $lexicon
} | sort | uniq > $extended_lexicon
fi

# We'll do multiple iterations of pron/sil-prob estimation. So the structure of
Expand Down
60 changes: 60 additions & 0 deletions egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash
# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016 Xiaohui Zhang
# 2018 Ruizhe Huang
# Apache 2.0

# This script applies a trained Phonetisaurus G2P model to
# synthesize pronunciations for missing words (i.e., words in
# transcripts but not in the lexicon), and outputs a lexicon
# containing the synthesized pronunciations for those words.
# The user can specify either the nbest or the pmass option
# to determine the number of output pronunciation variants,
# or use both together, in which case both limits apply.

# Begin configuration section.
# Defaults for the command-line options; presumably overridden as
# --<name> <value> by utils/parse_options.sh, which is sourced below.
# nbest and pmass are intentionally empty here: the script requires at least
# one of them to be given, and later falls back to nbest=20 / pmass=1.0 for
# whichever one was left unset.
stage=0
nbest= # Generate at most $nbest pronunciation variants per word
pmass= # Generate as many variants as needed to cover $pmass (e.g. 0.9 = 90%) of the probability mass
# End configuration section.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@huangruizhe Can you add "thresh" as an option here? Please refer to /export/b19/xzhang/tedlium/s5_r2/steps/dict/apply_g2p.sh (sorry, I only realized today that I already wrote a script like the current one two years ago). Also, please explain the nbest and pmass options in a bit more detail, again by referring to the above script. Thanks!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.


echo "$0 $@" # Print the command line for logging

# Pull in the environment (PATH etc.) if the recipe provides a path.sh.
if [ -f ./path.sh ]; then
  . ./path.sh
fi
# Command-line option parsing is delegated to the shared helper.
. utils/parse_options.sh || exit 1

# From here on: abort on errors and on use of unset variables.
set -eu

# Exactly three positional arguments are expected; otherwise show the usage.
if [ $# -ne 3 ]; then
  printf '%s\n' \
    "Usage: $0 [options] <g2p-model> <word-list> <lexicon-out>" \
    "... where <g2p-model> is the trained g2p model." \
    " <word-list> is a list of words whose pronunciation is to be generated." \
    " <lexicon-out> output lexicon, whose format is <word>\t<prob>\t<pronunciation> for each line." \
    "e.g.: $0 --nbest 1 exp/g2p/model.fst exp/g2p/oov_words.txt data/local/dict_nosp/lexicon.txt" \
    "" \
    "main options (for others, see top of script file)" \
    " --nbest <int> # Maximum number of hypotheses to produce. By default, nbest=20" \
    " --pmass <float> # Select the maximum number of hypotheses summing to a total mass of pmass amount, within [0, 1], for a word. By default, pmass=1.0"
  exit 1
fi

# Positional arguments: trained model, input word list, output lexicon.
model=$1
word_list=$2
out_lexicon=$3
# Companion file name derived from the output lexicon path.
out_lexicon_failed=$out_lexicon.failed
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, to keep the convention, probably you should ask the user to specify $outdir and write the output lexicon as $outdir/lexicon.lex as was done in the current apply_g2p.sh. And then put the list of failed words as $outdir/lexicon.failed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. Many thanks for all the above suggestions!


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

check whether phonetisaurus is installed here. Please refer to /export/b19/xzhang/tedlium/s5_r2/steps/dict/apply_g2p.sh also.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

# At least one of --nbest / --pmass must be given explicitly.
# (Expansions are quoted so that the tests stay well-formed for empty values.)
if [ -z "$pmass" ] && [ -z "$nbest" ]; then
  echo "$0: nbest or/and pmass should be specified."
  exit 1
fi

# three options: 1) nbest, 2) pmass, 3) nbest+pmass,
nbest=${nbest:-20} # if nbest is not specified, set it to 20, due to Phonetisaurus mechanism
pmass=${pmass:-1.0} # if pmass is not specified, set it to 1.0, due to Phonetisaurus mechanism

if [[ ! $nbest =~ ^[1-9][0-9]*$ ]]; then
  echo "$0: nbest should be a positive integer."
  exit 1
fi

echo "$0: Synthesizing pronunciations for words in $word_list based on nbest=$nbest and pmass=$pmass"
# stdout of phonetisaurus-apply is the synthesized lexicon.
phonetisaurus-apply --pmass "$pmass" --nbest "$nbest" --model "$model" --thresh 5 --accumulate --verbose --prob --word_list "$word_list" \
  1>"$out_lexicon"

echo "$0: Completed. Synthesized lexicon for new words is in $out_lexicon"

# Record every requested word that has no entry in the synthesized lexicon, so
# that failures are available in a file and not only in the tool's log output.
# FILENAME == ARGV[1] (rather than NR == FNR) keeps this correct even when the
# synthesized lexicon is empty.
awk 'FILENAME == ARGV[1] { have[$1] = 1; next }
     NF > 0 && !($1 in have) && !reported[$1]++ { print $1 }' \
  "$out_lexicon" "$word_list" > "$out_lexicon_failed"

if [ -s "$out_lexicon_failed" ]; then
  num_failed=$(( $(wc -l < "$out_lexicon_failed") ))
  echo "$0: WARNING: no pronunciation was generated for $num_failed word(s); they are listed in $out_lexicon_failed"
fi

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@huangruizhe Can you address Yenda's earlier comment: generating a list of failed words in a file and point it to the user in the echo message? The warning message from phonetisaurus is not consolidated into a file. So the user may miss it and want to find those words in a file. Actually I noticed your "out_lexicon_failed" is not used at all.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

# Reaching this point means every step above succeeded (set -e is active, so a
# failing command would already have aborted the script); report success.
exit 0
82 changes: 82 additions & 0 deletions egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/bin/bash

# Copyright 2017 Intellisist, Inc. (Author: Navneeth K)
# 2017 Xiaohui Zhang
# 2018 Ruizhe Huang
# Apache License 2.0

# This script trains a g2p model using Phonetisaurus.

# Begin configuration section.
# Defaults for the command-line options; presumably overridden as
# --<name> <value> by utils/parse_options.sh, which is sourced below.
stage=0            # First stage to run: 0=format lexicon, 1=align, 2=n-gram LM, 3=FST.
encoding='utf-8'   # Encoding handed to uconv when normalizing the lexicon.
only_words=true    # With --silence-phones: drop silence entries from the training lexicon.
silence_phones=    # Optional path to a silence phone list.
# End configuration section.

echo "$0 $@" # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

set -u
set -e

# Exactly two positional arguments are required. The usage text lists only
# options this script actually defines (it has no --cmd option: nothing here
# is dispatched through a job runner).
if [ $# != 2 ]; then
  echo "Usage: $0 [options] <lexicon-in> <work-dir>"
  echo " where <lexicon-in> is the training lexicon (one pronunciation per "
  echo " word per line, with lines like 'hello h uh l ow') and"
  echo " <work-dir> is directory where the models will be stored"
  echo "e.g.: $0 --silence-phones data/local/dict/silence_phones.txt data/local/dict/lexicon.txt exp/g2p/"
  echo ""
  echo "main options (for others, see top of script file)"
  echo " --stage <int> (default: 0) # Stage to start from: 0=format lexicon, 1=align, 2=train n-gram LM, 3=build FST."
  echo " --encoding <name> (default: utf-8) # Character encoding of the lexicon, passed to uconv."
  echo " --silence-phones <silphones-list> # e.g. data/local/dict/silence_phones.txt."
  echo " # A list of silence phones, one or more per line"
  echo " # Relates to --only-words option"
  echo " --only-words (true|false) (default: true) # If true, exclude silence words, i.e."
  echo " # words with one or multiple phones which are all silence."
  exit 1;
fi

# Positional arguments: training lexicon and output/work directory.
lexicon=$1
wdir=$2

# A missing lexicon is an error: exit non-zero so that callers (e.g.
# "... || touch exp/g2p/.error") can detect the failure. A bare "exit" here
# would return the status of the preceding echo, i.e. success.
if [ ! -f "$lexicon" ]; then
  echo "Cannot find $lexicon"
  exit 1
fi

# uconv (from ICU) is needed for Unicode normalization of the lexicon.
# The lookup is done inside the "if" condition so that, with set -e active,
# a missing tool leads to the explanatory message below rather than to a
# silent abort on a failing command substitution.
if ! command -v uconv >/dev/null 2>&1; then
  echo "uconv was not found. You must install the icu4c package."
  exit 1;
fi

mkdir -p "$wdir"


# Print the lexicon ($2) without the entries whose entire pronunciation is
# listed in the first column of the silence-phones file ($1).
# Output lines have the form "<word> <phone> <phone> ...".
# FILENAME == ARGV[1] is used instead of NR == FNR so that an empty
# silence-phones file filters nothing (NR == FNR would then be true for every
# lexicon line and the whole lexicon would be swallowed).
drop_silence_words() {
  awk 'FILENAME == ARGV[1] { sil[$1] = 1; next }
       { pron = $2; for (i = 3; i <= NF; i++) pron = pron " " $i
         if (!(pron in sil)) print $1 " " pron }' "$1" "$2"
}

# Rewrite "<word> <phones...>" lines as "<word>\t<phones...>" (word and
# pronunciation separated by a tab, phones by single spaces).
# Reads the files given as arguments, or stdin when called without arguments.
tab_separate_word() {
  awk '{printf("%s\t",$1); for (i=2;i<NF;i++){printf("%s ",$i);} printf("%s\n",$NF);}' "$@"
}

# For input lexicon, apply Unicode NFC normalization via uconv (in the given
# encoding), drop empty lines, and optionally remove words that are mapped to
# a single silence phone from the lexicon.
if [ "$stage" -le 0 ]; then
  if $only_words && [ -n "$silence_phones" ]; then
    drop_silence_words "$silence_phones" "$lexicon" | tab_separate_word
  else
    tab_separate_word "$lexicon"
  fi \
    | uconv -f "$encoding" -t "$encoding" -x Any-NFC - \
    | awk 'NF > 0' > "$wdir/lexicon_tab_separated.txt"
fi

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

check whether phonetisaurus is installed here. Please refer to /export/b19/xzhang/tedlium/s5_r2/steps/dict/train_g2p.sh also.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed.

# Abort with an explanatory message if the executable named by $1 cannot be
# found on PATH.
require_tool() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "$0: $1 was not found on PATH. Please install Phonetisaurus and add it to your PATH."
    exit 1
  fi
}

if [ "$stage" -le 1 ]; then
  # Align lexicon stage. Lexicon is assumed to have first column tab separated
  require_tool phonetisaurus-align
  phonetisaurus-align --input="$wdir/lexicon_tab_separated.txt" --ofile="$wdir/aligned_lexicon.corpus" || exit 1;
fi

if [ "$stage" -le 2 ]; then
  # Convert aligned lexicon to arpa using make_kn_lm.py, a re-implementation of srilm's ngram-count functionality.
  ./utils/lang/make_kn_lm.py -ngram-order 7 -text "$wdir/aligned_lexicon.corpus" -lm "$wdir/aligned_lexicon.arpa" || exit 1;
fi

if [ "$stage" -le 3 ]; then
  # Convert the arpa file to FST.
  require_tool phonetisaurus-arpa2wfst
  phonetisaurus-arpa2wfst --lm="$wdir/aligned_lexicon.arpa" --ofile="$wdir/model.fst" || exit 1;
fi

Loading