diff --git a/egs/wsj/s5/utils/lang/extend_lang.sh b/egs/wsj/s5/utils/lang/extend_lang.sh index 57f8e3d7588..7602cb983de 100755 --- a/egs/wsj/s5/utils/lang/extend_lang.sh +++ b/egs/wsj/s5/utils/lang/extend_lang.sh @@ -1,5 +1,6 @@ #!/bin/bash # Copyright 2018 Johns Hopkins University (Author: Daniel Povey); +# 2019 Dongji Gao # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +19,7 @@ # Begin configuration section. sil_prob=0.5 +silprob_file= # end configuration section echo "$0 $@" # Print the command line for logging @@ -43,6 +45,7 @@ if [ $# -ne 3 ]; then echo "" echo "Options" echo " --sil-prob # default: 0.5 [must have 0 <= silprob < 1]" + echo " --silprob-file # must be provided if lexicon is lexiconp_silprob.txt" exit 1; fi @@ -52,6 +55,7 @@ dir=$3 [ -f path.sh ] && . ./path.sh + for f in $srcdir/phones.txt $lexicon; do if [ ! -f $f ]; then echo "$0: expected file $f to exist" @@ -79,11 +83,26 @@ tmpdir=$dir/temp rm -r $tmpdir 2>/dev/null mkdir -p $tmpdir - -# TODO: more checking. -if [ $(basename $lexicon) != lexiconp.txt ]; then - echo "$0: currently this script only supports the lexiconp.txt format; your lexicon" - echo " ... has to have that filename." +silprob=false + +if [ $(basename $lexicon) == "lexiconp_silprob.txt" ]; then + silprob=true + if [ -z $silprob_file ] ; then + echo "silprob_file not provided, checking $srcdir" + if [ -f $srcdir/silprob.txt ]; then + silprob_file=$srcdir/silprob.txt + echo "silprob_file found in $srcdir" + else + echo "silprob_file not found in $srcdir" && exit 1; + fi + else + if [ ! -f $silprob_file ]; then + echo "$silprob_file does not exist" && exit 1; + fi + fi +elif [ $(basename $lexicon) != lexiconp.txt ]; then + echo "$0: currently this script only supports the lexiconp.txt or lexiconp_silprob.txt format;" + echo " ... your lexicon has to have that filename." fi # Get the list of extra words. 
@@ -105,22 +124,45 @@ fi if [ -f $dir/phones/word_boundary.txt ]; then # was `if $position_dependent_phones; then..` in prepare_lang.sh - # TODO: add support for silprobs - perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die; - if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B "; + if "$silprob"; then + perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; + $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; + if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } + else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ - < $lexicon > $tmpdir/lexiconp.txt || exit 1; + < $lexicon > $tmpdir/lexiconp_silprob.txt || exit 1; + else + perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; @A>0||die; + if(@A==1) { print "$w $p $A[0]_S\n"; } else { print "$w $p $A[0]_B "; + for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ + < $lexicon > $tmpdir/lexiconp.txt || exit 1; + fi else - cp $lexicon $tmpdir/lexiconp.txt + if "$silprob"; then + cp $lexicon $tmpdir/lexiconp_silprob.txt + else + cp $lexicon $tmpdir/lexiconp.txt + fi fi # Check that there are no unseen phones in the lexicon. -if ! utils/sym2int.pl -f 3- $srcdir/phones.txt $tmpdir/lexiconp.txt >/dev/null; then - echo "$0: it looks like there are unseen phones in your lexicon $lexicon" - exit 1 +if "$silprob"; then + if ! utils/sym2int.pl -f 6- $srcdir/phones.txt $tmpdir/lexiconp_silprob.txt >/dev/null; then + echo "$0: it looks like there are unseen phones in your lexicon $lexicon" + exit 1 + fi +else + if ! 
utils/sym2int.pl -f 3- $srcdir/phones.txt $tmpdir/lexiconp.txt >/dev/null; then + echo "$0: it looks like there are unseen phones in your lexicon $lexicon" + exit 1 + fi fi -ndisambig=$(utils/add_lex_disambig.pl --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) +if "$silprob"; then + ndisambig=$(utils/add_lex_disambig.pl --pron-probs --sil-probs $tmpdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob_disambig.txt) +else + ndisambig=$(utils/add_lex_disambig.pl --pron-probs $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt) +fi ndisambig=$[ndisambig+1] # Add one to disambiguate silence. @@ -146,6 +188,15 @@ silphone=`cat $srcdir/phones/optional_silence.txt` || exit 1; echo "but you may use the option --sil-prob 0.0 to stop it being used." ) && \ exit 1; +if "$silprob"; then + # remove the silprob + cat $tmpdir/lexiconp_silprob.txt |\ + awk '{ + for(i=1; i<=NF; i++) { + if(i!=3 && i!=4 && i!=5) printf("%s\t", $i); if(i==NF) print ""; + } + }' > $tmpdir/lexiconp.txt +fi # First remove pron-probs from the lexicon. perl -ape 's/(\S+\s+)\S+\s+(.+)/$1$2/;' <$tmpdir/lexiconp.txt >$tmpdir/align_lexicon.txt @@ -161,7 +212,7 @@ if [ -f $dir/phones/nonterminals.txt ]; then for w in "#nonterm_begin" "#nonterm_end" $(cat $dir/phones/nonterminals.txt); do echo $w $w # These are words without pronunciations, so leave those prons # empty. - done >> $dir/phones/align_lexicon.txt + done >> $dir/phones/align_lexicon.txt fi # create phones/align_lexicon.int from phones/align_lexicon.txt @@ -170,22 +221,39 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ # Create the basic L.fst without disambiguation symbols, for use # in training. 
- -utils/lang/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \ - $tmpdir/lexiconp.txt | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; +if "$silprob"; then + utils/lang/make_lexicon_fst_silprob.py $grammar_opts --sil-phone=$silphone \ + $tmpdir/lexiconp_silprob.txt $silprob_file | \ + fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; +else + utils/lang/make_lexicon_fst.py $grammar_opts --sil-prob=$sil_prob --sil-phone=$silphone \ + $tmpdir/lexiconp.txt | \ + fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstarcsort --sort_type=olabel > $dir/L.fst || exit 1; +fi # and create the version that has disambiguation symbols. -utils/lang/make_lexicon_fst.py $grammar_opts \ - --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \ - $tmpdir/lexiconp_disambig.txt | \ - fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ - --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ - fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; +if "$silprob"; then + utils/lang/make_lexicon_fst_silprob.py $grammar_opts \ + --sil-phone=$silphone --sil-disambig='#'$ndisambig \ + $tmpdir/lexiconp_silprob_disambig.txt $silprob_file | \ + fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ + fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; +else + utils/lang/make_lexicon_fst.py $grammar_opts \ + --sil-prob=$sil_prob --sil-phone=$silphone --sil-disambig='#'$ndisambig \ 
+ $tmpdir/lexiconp_disambig.txt | \ + fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ + fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; +fi echo "$(basename $0): validating output directory" diff --git a/egs/wsj/s5/utils/prepare_extended_lang.sh b/egs/wsj/s5/utils/prepare_extended_lang.sh index 824654cabf1..57cfcaabe34 100755 --- a/egs/wsj/s5/utils/prepare_extended_lang.sh +++ b/egs/wsj/s5/utils/prepare_extended_lang.sh @@ -24,6 +24,7 @@ word_list= # if a word list (mapping words from the srcdict to IDs) is provided, # we'll make sure the IDs of these words are kept as before. # end configuration sections +echo "$0: warning: This script is now deprecated. You may want to use utils/lang/extend_lang.sh" echo "$0 $@" # Print the command line for logging . utils/parse_options.sh