Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 96 additions & 28 deletions egs/wsj/s5/utils/lang/extend_lang.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/bin/bash
# Copyright 2018 Johns Hopkins University (Author: Daniel Povey);
# 2019 Dongji Gao

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -18,6 +19,7 @@

# Begin configuration section.
# These defaults correspond to the --sil-prob and --silprob-file options
# listed in the usage message.
# sil_prob: probability of silence; the usage text requires 0 <= sil_prob < 1.
#   It is forwarded as --sil-prob to utils/lang/make_lexicon_fst.py.
sil_prob=0.5
# silprob_file: file of silence probabilities, only consulted when the lexicon
#   is named lexiconp_silprob.txt.  When left empty the script falls back to
#   $srcdir/silprob.txt and exits if that file is missing too.
silprob_file=
# end configuration section

echo "$0 $@" # Print the command line for logging
Expand All @@ -43,6 +45,7 @@ if [ $# -ne 3 ]; then
echo ""
echo "Options"
echo " --sil-prob <probability of silence> # default: 0.5 [must have 0 <= silprob < 1]"
echo " --silprob-file <file contains silence probability> # must be provided if lexicon is lexiconp_silprob.txt"
exit 1;
fi

Expand All @@ -52,6 +55,7 @@ dir=$3

[ -f path.sh ] && . ./path.sh


for f in $srcdir/phones.txt $lexicon; do
if [ ! -f $f ]; then
echo "$0: expected file $f to exist"
Expand Down Expand Up @@ -79,11 +83,26 @@ tmpdir=$dir/temp
# Start from an empty scratch directory: delete whatever a previous run left in
# $tmpdir (the error printed when it does not exist yet is discarded), then
# create it again.
rm -r $tmpdir 2>/dev/null
mkdir -p $tmpdir


# TODO: more checking.
if [ $(basename $lexicon) != lexiconp.txt ]; then
echo "$0: currently this script only supports the lexiconp.txt format; your lexicon"
echo " ... has to have that filename."
# Decide which lexicon format was supplied, based purely on its file name:
#   lexiconp_silprob.txt -> silprob=true; a silence-probability file is needed
#   lexiconp.txt         -> silprob=false
# Any other name only triggers the warning at the bottom and is then processed
# like lexiconp.txt.
# NOTE(review): the warning says the file name is mandatory, yet the script
# does not exit there; kept non-fatal so existing callers keep working.
#
# All expansions are quoted so that paths containing whitespace are classified
# correctly instead of being word-split before reaching basename / test.
silprob=false
lexicon_basename=$(basename -- "$lexicon")

if [ "$lexicon_basename" == "lexiconp_silprob.txt" ]; then
  silprob=true
  if [ -z "$silprob_file" ]; then
    # No --silprob-file given: fall back to the one in the source lang dir.
    echo "silprob_file not provided, checking $srcdir"
    if [ -f "$srcdir/silprob.txt" ]; then
      silprob_file=$srcdir/silprob.txt
      echo "silprob_file found in $srcdir"
    else
      echo "silprob_file not found in $srcdir"
      exit 1
    fi
  elif [ ! -f "$silprob_file" ]; then
    echo "$silprob_file does not exist"
    exit 1
  fi
elif [ "$lexicon_basename" != lexiconp.txt ]; then
  echo "$0: currently this script only supports the lexiconp.txt or lexiconp_silprob.txt format;"
  echo " ... your lexicon has to have that filename."
fi

# Get the list of extra words.
Expand All @@ -105,22 +124,45 @@ fi

# Write the working copy of the lexicon into $tmpdir, as
# lexiconp_silprob.txt when $silprob is true and as lexiconp.txt otherwise.
#
# If $dir/phones/word_boundary.txt exists, every phone gets a position suffix:
# a one-phone pronunciation becomes <phone>_S; otherwise the first phone gets
# _B, the last _E and those in between _I.  The leading fields (word and
# pron-prob, plus the three silprob fields for the silprob format) are copied
# through unchanged.  A line with no phones makes perl die, which aborts the
# script in both formats.
# Without word_boundary.txt the lexicon is copied verbatim.
if [ -f "$dir/phones/word_boundary.txt" ]; then
  # was `if $position_dependent_phones; then..` in prepare_lang.sh
  if "$silprob"; then
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A;
      $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die;
      if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; }
      else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B ";
      for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
      < "$lexicon" > "$tmpdir/lexiconp_silprob.txt" || exit 1;
  else
    perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die;
      if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B ";
      for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \
      < "$lexicon" > "$tmpdir/lexiconp.txt" || exit 1;
  fi
else
  if "$silprob"; then
    # Must land in $tmpdir: later steps read $tmpdir/lexiconp_silprob.txt.
    cp "$lexicon" "$tmpdir/lexiconp_silprob.txt"
  else
    cp "$lexicon" "$tmpdir/lexiconp.txt"
  fi
fi

# Check that there are no unseen phones in the lexicon.
if ! utils/sym2int.pl -f 3- $srcdir/phones.txt $tmpdir/lexiconp.txt >/dev/null; then
echo "$0: it looks like there are unseen phones in your lexicon $lexicon"
exit 1
# Make sure every phone used in the working lexicon is listed in
# $srcdir/phones.txt.  The phones start at field 6 in the silprob format and at
# field 3 otherwise, so pick the file and first phone field before running the
# (single) sym2int.pl check.
if "$silprob"; then
  lexicon_to_check=$tmpdir/lexiconp_silprob.txt
  first_phone_field=6
else
  lexicon_to_check=$tmpdir/lexiconp.txt
  first_phone_field=3
fi

utils/sym2int.pl -f ${first_phone_field}- $srcdir/phones.txt $lexicon_to_check >/dev/null || {
  echo "$0: it looks like there are unseen phones in your lexicon $lexicon"
  exit 1
}

ndisambig=$(utils/add_lex_disambig.pl --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt)
# Run add_lex_disambig.pl on the working lexicon, writing the *_disambig.txt
# variant next to it; the silprob format additionally needs --sil-probs.
# Whatever the tool prints on stdout is captured in ndisambig (presumably the
# number of disambiguation symbols it used -- confirm against the tool).
if "$silprob"; then
  ndisambig=$(utils/add_lex_disambig.pl --pron-probs --sil-probs \
    "$tmpdir/lexiconp_silprob.txt" "$tmpdir/lexiconp_silprob_disambig.txt")
else
  ndisambig=$(utils/add_lex_disambig.pl --pron-probs \
    "$tmpdir/lexiconp.txt" "$tmpdir/lexiconp_disambig.txt")
fi

# $(( )) replaces the deprecated $[ ] arithmetic form.
ndisambig=$((ndisambig+1)) # Add one to disambiguate silence.

Expand All @@ -146,6 +188,15 @@ silphone=`cat $srcdir/phones/optional_silence.txt` || exit 1;
echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \
exit 1;

# For the silprob format, derive $tmpdir/lexiconp.txt from
# $tmpdir/lexiconp_silprob.txt by dropping fields 3, 4 and 5 (the silprob
# columns).  Every field that is kept is followed by a tab, so each output
# line ends in a tab before its newline; empty input lines produce no output.
if "$silprob"; then
  # remove the silprob
  awk '{
    for (col = 1; col <= NF; col++) {
      if (col < 3 || col > 5) printf("%s\t", $col);
    }
    if (NF > 0) print "";
  }' > $tmpdir/lexiconp.txt < $tmpdir/lexiconp_silprob.txt
fi

# First remove pron-probs from the lexicon.
perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt
Expand All @@ -161,7 +212,7 @@ if [ -f $dir/phones/nonterminals.txt ]; then
for w in "#nonterm_begin" "#nonterm_end" $(cat $dir/phones/nonterminals.txt); do
echo $w $w # These are words without pronunciations, so leave those prons
# empty.
done >> $dir/phones/align_lexicon.txt
done >> $dir/phones/align_lexicon.txt
fi

# create phones/align_lexicon.int from phones/align_lexicon.txt
Expand All @@ -170,22 +221,39 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \

# Create the basic L.fst without disambiguation symbols, for use
# in training.

utils/lang/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
$tmpdir/lexiconp.txt | \
fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
# Build $dir/L.fst (no disambiguation symbols).  Only the program that emits
# the text FST depends on the lexicon format; its output is then compiled
# against phones.txt / words.txt and arc-sorted on output labels.  The silprob
# generator takes the silprob file as a second argument and no --sil-prob.
if "$silprob"; then
  utils/lang/make_lexicon_fst_silprob.py $grammar_opts --sil-phone=$silphone \
    $tmpdir/lexiconp_silprob.txt $silprob_file
else
  utils/lang/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \
    $tmpdir/lexiconp.txt
fi | \
  fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    --keep_isymbols=false --keep_osymbols=false | \
  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;


# and create the version that has disambiguation symbols.
utils/lang/make_lexicon_fst.py $grammar_opts \
--sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \
$tmpdir/lexiconp_disambig.txt | \
fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
--keep_isymbols=false --keep_osymbols=false | \
fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;
# Build $dir/L_disambig.fst from the *_disambig.txt lexicon.  As for L.fst,
# only the generator differs between the two lexicon formats; both are given
# '#'$ndisambig as the silence disambiguation symbol.  The compiled FST then
# gets self-loops for the symbols listed in wdisambig_phones.int /
# wdisambig_words.int before being arc-sorted on output labels.
if "$silprob"; then
  utils/lang/make_lexicon_fst_silprob.py $grammar_opts \
    --sil-phone=$silphone --sil-disambig='#'$ndisambig \
    $tmpdir/lexiconp_silprob_disambig.txt $silprob_file
else
  utils/lang/make_lexicon_fst.py $grammar_opts \
    --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \
    $tmpdir/lexiconp_disambig.txt
fi | \
  fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \
    --keep_isymbols=false --keep_osymbols=false | \
  fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \
  fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1;


echo "$(basename $0): validating output directory"
Expand Down
1 change: 1 addition & 0 deletions egs/wsj/s5/utils/prepare_extended_lang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ word_list= # if a word list (mapping words from the srcdict to IDs) is provided,
# we'll make sure the IDs of these words are kept as before.
# end configuration sections

# Tell the user up front that this script is superseded, then log the exact
# command line that was used.
echo "$0: warning: This script is now deprecated. You may want to use utils/lang/extend_lang.sh"
echo "$0 $@" # Print the command line for logging

. utils/parse_options.sh
Expand Down