diff --git a/egs/sprakbanken/s5/local/copy_dict.sh b/egs/sprakbanken/s5/local/copy_dict.sh
index c5cd1fc77b4..5ae5e9697b1 100755
--- a/egs/sprakbanken/s5/local/copy_dict.sh
+++ b/egs/sprakbanken/s5/local/copy_dict.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
+# Copyright 2014-15 Mirsk Digital ApS (Author: Andreas Kirkedal)
+# Copyright 2016 Andreas Kirkedal
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,9 +17,7 @@
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
-KALDI_ROOT=$(pwd)/../../..
-
-exproot=$(pwd)
+lex=lexicon-da-nonorm.tgz
 dir=data/local/dict
 mkdir -p $dir
@@ -31,22 +30,13 @@ cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
 cp local/dictsrc/extra_questions.txt $dir/extra_questions.txt
 # Copy pre-made lexicon
-wget http://www.openslr.org/resources/8/lexicon-da.tgz --directory-prefix=data/local/data/download
-tar -xzf data/local/data/download/lexicon-da.tgz -C $dir
+wget http://www.openslr.org/resources/8/$lex --directory-prefix=data/local/data/download
+tar -xzf data/local/data/download/$lex -C $dir
 # silence phones, one per line.
-echo SIL > $dir/silence_phones.txt
+echo -e "SIL\nSPN" > $dir/silence_phones.txt
 echo SIL > $dir/optional_silence.txt
-
-
-
-
-wait
-
-
-## TODO: add cleanup commands
-
 echo "Dictionary preparation succeeded"
diff --git a/egs/sprakbanken/s5/local/create_datasets.sh b/egs/sprakbanken/s5/local/create_datasets.sh
index b0d87a730e8..891771dbce1 100755
--- a/egs/sprakbanken/s5/local/create_datasets.sh
+++ b/egs/sprakbanken/s5/local/create_datasets.sh
@@ -24,7 +24,7 @@ fi
 src=$1
 dest=$2
 mkdir $dest
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
+python local/normalize_transcript_prefixed.py local/norm_dk/numbersLow.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
 local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
 paste -d ' ' $src/onlyids $src/onlytext > $dest/text
 for f in wav.scp utt2spk; do
diff --git a/egs/sprakbanken/s5/local/dict_prep.sh b/egs/sprakbanken/s5/local/dict_prep.sh
index 8ecfa028408..1e37460dbe5 100755
--- a/egs/sprakbanken/s5/local/dict_prep.sh
+++ b/egs/sprakbanken/s5/local/dict_prep.sh
@@ -2,6 +2,7 @@
 # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
 # Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
+# Copyright 2014-2016 Andreas Kirkedal
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,57 +20,24 @@
 KALDI_ROOT=$(pwd)/../../..
 exproot=$(pwd)
-dir=data/local/dict
+lmdir=data/local/transcript_lm
+dictsrc=data/local/dictsrc
+dictdir=data/local/dict
 espeakdir='espeak-1.48.04-source'
-mkdir -p $dir
+mkdir -p $dictsrc $dictdir
 # Dictionary preparation:
-
-# Normalise transcripts and create a transcript file
-# Removes '.,:;?'
and removes '\' before '\Komma' (dictated ',') -# outputs a normalised transcript without utterance ids and a list of utterance ids -echo "Normalising" - -# Create dir to hold lm files and other non-standard files, useful for debugging -trainsrc=data/local/trainsrc -rm -rf $trainsrc -mkdir $trainsrc -mv data/train/text1 $trainsrc/text1 -python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp - -# Additional normalisation, uppercasing, writing numbers etc. -# and recombine with -local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am -cp $dir/transcripts.am $trainsrc/onlytext -paste $trainsrc/onlyids $trainsrc/onlytext > data/train/text -utils/validate_data_dir.sh --no-feat data/train || exit 1; - - - -# lmsents is output by sprak_data_prep.sh and contains -# sentences that are disjoint from the test and dev set -python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm -wait - # Create wordlist from the AM transcripts -cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt & - -# Because training data is read aloud, there are many occurences of the same -# sentence and bias towards the domain. Make a version where -# the sentences are unique to reduce bias. -local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt -sort -u $dir/transcripts.txt > $dir/transcripts.uniq - +cat $lmdir/transcripts.uniq | tr [:blank:] '\n' | sort -u > $dictsrc/wlist.txt & # Install eSpeak if it is not installed already - if hash espeak 2>/dev/null; - then +then echo 'eSpeak installed' - else - cd $KALDI_ROOT/tools || exit 1; +else + cd $KALDI_ROOT/tools || exit 1; wget http://sourceforge.net/projects/espeak/files/espeak/espeak-1.48/${espeakdir}.zip wait unzip -q $espeakdir.zip @@ -81,87 +49,60 @@ if hash espeak 2>/dev/null; cd $exproot || exit 1; fi - - # Wait for the wordlist to be fully created -wait - +wait # Run wordlist through espeak to get phonetics # improvised parallelisation - simple call because 'split' often has different versions -split -l 10000 $dir/wlist.txt $dir/Wtemp_ -for w in $dir/Wtemp_*; do - (cat $w | espeak -q -vda -x > $w.pho) & +split -l 10000 $dictsrc/wlist.txt $dictsrc/Wtemp_ +for w in $dictsrc/Wtemp_*; do + (cat $w | espeak -q -vda -x > $w.pho) & done wait -cat $dir/Wtemp_*.pho > $dir/plist.txt -rm -f $dir/Wtemp_* +cat $dictsrc/Wtemp_*.pho > $dictsrc/plist.txt +rm -f $dictsrc/Wtemp_* # Filter transcription -# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove +# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove # initial and trailing spaces and collapse 2 or more spaces to one space -cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt +cat $dictsrc/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dictsrc/plist2.txt #Some question marks are not caught above -perl -pe 's/ \? 
/ /g' $dictsrc/plist2.txt > $dictsrc/plist3.txt
 
 # Create lexicon.txt and put it in data/local/dict
-paste $dir/wlist.txt $dir/plist3.txt > $dir/lexicon1.txt
+paste $dictsrc/wlist.txt $dictsrc/plist3.txt > $dictsrc/lexicon1.txt
 
 # Remove entries without transcription
-grep -P "^.+\t.+$" $dir/lexicon1.txt > $dir/lexicon2.txt
+grep -P "^.+\t.+$" $dictsrc/lexicon1.txt > $dictsrc/lexicon2.txt
 
 # Copy pre-made phone table with
-cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
+cp local/dictsrc/complexphones.txt $dictdir/nonsilence_phones.txt
 
 # Add "!SIL SIL" to lexicon.txt
-echo -e '!SIL\tSIL' > $dir/lex_first
-echo -e '<UNK>\tSPN' >> $dir/lex_first
-cat $dir/lexicon2.txt >> $dir/lex_first
-mv $dir/lex_first $dir/lexicon.txt
+echo -e '!SIL\tSIL' > $dictsrc/lex_first
+echo -e '<UNK>\tSPN' >> $dictsrc/lex_first
+cat $dictsrc/lexicon2.txt >> $dictsrc/lex_first
+mv $dictsrc/lex_first $dictdir/lexicon.txt
 
 # silence phones, one per line.
-echo SIL > $dir/silence_phones.txt
-echo SIL > $dir/optional_silence.txt
-
-touch $dir/extra_questions.txt
-
-# Repeat text preparation on test set, but do not add to dictionary
-# Create dir to hold lm files and other non-standard files
-testsrc=data/local/testsrc
-rm -rf $testsrc
-mkdir $testsrc
-mv data/test/text1 $testsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am
-local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
-paste $testsrc/onlyids $testsrc/onlytext > data/test/text
-utils/validate_data_dir.sh --no-feat data/test || exit 1;
-
-# Repeat text preparation on dev set, but do not add to dictionary
-# Create dir to hold lm files and other non-standard files
-devsrc=data/local/devsrc
-rm -rf $devsrc
-mkdir $devsrc
-mv data/dev/text1 $devsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
-local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
-paste $devsrc/onlyids $devsrc/onlytext > data/dev/text &
-
-# Also create a file that can be used for reranking using text features
-local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
-sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq
-
-
-utils/validate_data_dir.sh --no-feat data/dev || exit 1;
+if [ ! -f $dictdir/silence_phones.txt ]; then
+  echo SIL > $dictdir/silence_phones.txt
+fi
+if [ ! -f $dictdir/optional_silence.txt ]; then
+  echo SIL > $dictdir/optional_silence.txt
+fi
 
-## TODO: add cleanup commands
+if [ !
-f $dictdir/extra_questions.txt ]; then + touch $dictdir/extra_questions.txt +fi -echo "Normalisation and dictionary preparation succeeded" +echo "Dictionary preparation succeeded" diff --git a/egs/sprakbanken/s5/local/norm_dk/format_text.sh b/egs/sprakbanken/s5/local/norm_dk/format_text.sh index ff85c8cc0ef..abbf975dbdf 100755 --- a/egs/sprakbanken/s5/local/norm_dk/format_text.sh +++ b/egs/sprakbanken/s5/local/norm_dk/format_text.sh @@ -34,8 +34,8 @@ nonum=$tmp/nonum.tmp cat $2 | tr -d '\r' > $src -$dir/expand_abbr_medical.sh $src > $abbr; -$dir/remove_annotation.sh $abbr > $rem; +#$dir/expand_abbr_medical.sh $src > $abbr; +$dir/remove_annotation.sh $src > $rem; if [ $mode != "am" ]; then $dir/sent_split.sh $rem > $line; else @@ -45,10 +45,11 @@ fi $dir/expand_dates.sh $line |\ $dir/format_punct.sh > $num; #python3 $dir/writenumbers.py $dir/numbersUp.tbl $num $nonum; -cat $num | $dir/write_punct.sh | \ +# $dir/write_punct.sh | \ +cat $num | \ perl -pi -e "s/^\n//" | \ -perl -pe 's/ (.{4}.*?)\./ \1/g' | \ -PERLIO=:utf8 perl -pe '$_=uc' +perl -pe 's/ (.{4}.*?)\./ \1/g' +# | PERLIO=:utf8 perl -pe '$_=lc' # Comment this line for debugging wait diff --git a/egs/sprakbanken/s5/local/norm_dk/numbersLow.tbl b/egs/sprakbanken/s5/local/norm_dk/numbersLow.tbl new file mode 100644 index 00000000000..824c0afa3b2 --- /dev/null +++ b/egs/sprakbanken/s5/local/norm_dk/numbersLow.tbl @@ -0,0 +1,265 @@ +¼ en fjerdedel +½ en halv +0 nul +² i anden +enogfirs en og firs +enogfyrre en og fyrre +enoghalvfems en og halvfems +enoghalvfjerds en og halvfjerds +enoghalvtreds en og halvtreds +enogtredive en og tredive +enogtredivte en og tredivte +enogtres en og tres +enogtyvende en og tyvende +femogfirs fem og firs +femogfyrre fem og fyrre +femoghalvfems fem og halvfems +femoghalvfjerds fem og halvfjerds +femoghalvtreds fem og halvtreds +femogtredive fem og tredive +femogtres fem og tres +femogtyve fem og tyve +femogtyvende fem og tyvende +fireogfirs fire og firs +fireogfyrre fire og fyrre +fireoghalvfems fire og halvfems +fireoghalvfjerds fire og halvfjerds +fireoghalvtreds fire og halvtreds +fireogtredive fire og tredive +fireogtres fire og tres +fireogtyve fire og tyve +fireogtyvende fire og tyvende +fyrreogtyvende fyrre og tyvende +niogfirs ni og firs +niogfyrre ni og fyrre +nioghalvfems ni og halvfems +nioghalvfjerds ni og halvfjerds +nioghalvtreds ni og halvtreds +niogtredive ni og tredive +niogtres ni og tres +niogtyvende ni og tyvende +niogtyve ni og tyve +otteogfirs otte og firs +otteogfyrre otte og fyrre +otteoghalvfems otte og halvfems +otteoghalvfjerds otte og halvfjerds +otteoghalvtreds otte og halvtreds +otteogtredive otte og tredive +otteogtres otte og tres +otteogtyvende otte og tyvende +otteogtyve otte og tyve +seksogfirs seks og firs +seksogfyrre seks og fyrre +seksoghalvfems seks og halvfems +seksoghalvfjerds seks og halvfjerds +seksoghalvtreds seks og halvtreds +seksogtredive seks og tredive +seksogtres seks og tres +seksogtyvende seks og tyvende +seksogtyve seks og tyve +syvogfirs syv og firs +syvogfyrre syv og fyrre +syvoghalvfems syv og halvfems +syvoghalvfjerds syv og halvfjerds +syvoghalvtreds syv og halvtreds +syvogtredive syv og tredive +syvogtres syv og tres +syvogtyvende syv og tyvende +syvogtyve syv og tyve +toogfirs to og firs +toogfyrre to og fyrre +tooghalvfems to og halvfems +tooghalvfjerds to og halvfjerds +tooghalvtreds to og halvtreds +toogtredive to og tredive +toogtres to og tres +toogtyvende to og tyvende +toogtyve to og tyve +totusindogatten to tusind og atten 
+totusindogelleve to tusind og elleve +totusindoget to tusind og et +totusindogfemten to tusind og femten +totusindogfem to tusind og fem +totusindogfire to tusind og fire +totusindogfjorten to tusind og fjorten +totusindogni to tusind og ni +totusindognitten to tusind og nitten +totusindogotte to tusind og otte +totusindogseksten to tusind og seksten +totusindogseks to tusind og seks +totusindogsytten to tusind og sytten +totusindogsyv to tusind og syv +totusindogti to tusind og ti +totusindogtolv to tusind og tolv +totusindogto to tusind og to +totusindogtre to tusind og tre +totusindogtretten to tusind og tretten +totusindogtyve to tusind og tyve +treogfirs tre og firs +treogfyrre tre og fyrre +treoghalvfems tre og halvfems +treoghalvfjerds tre og halvfjerds +treoghalvtreds tre og halvtreds +treogtredive tre og tredive +treogtres tre og tres +treogtyvende tre og tyvende +treogtyve tre og tyve +1 en +1. første +2. anden +2 to +3 tre +3. tredje +4 fire +4. fjerde +5 fem +5. femte +6 seks +6. sjette +7 syv +7. syvende +8 otte +8. ottende +9 ni +9. niende +10 ti +10. tiende +11 elleve +11. ellevte +12 tolv +12. tolvte +13 tretten +13. trettende +14 fjorten +14. fjortende +15 femten +15. femtende +16 seksten +16. sekstende +17 sytten +17. syttende +18 atten +18. attende +19 nitten +19. nittende +20 tyve +20. tyvende +21 en og tyve +21. en og tyvende +22 to og tyve +22. to og tyvende +23 tre og tyve +23. tre og tyvende +24 fire og tyve +24. fire og tyvende +25 fem og tyve +25. fem og tyvende +26 seks og tyve +26. seks og tyvende +27 syv og tyve +27. syv og tyvende +28 otte og tyve +28. otte og tyvende +29 ni og tyve +29. ni og tyvende +30 tredive +30. tredivte +31 en og tredive +31. en og tredivte +32 to og tredive +33 tre og tredive +34 fire og tredive +35 fem og tredive +36 seks og tredive +37 syv og tredive +38 otte og tredive +39 ni og tredive +40 fyrre +40. fyrre og tyvende +41 en og fyrre +42 to og fyrre +43 tre og fyrre +44 fire og fyrre +45 fem og fyrre +46 seks og fyrre +47 syv og fyrre +48 otte og fyrre +49 ni og fyrre +50 halvtreds +50. halvtredsinds tyvende +51 en og halvtreds +52 to og halvtreds +53 tre og halvtreds +54 fire og halvtreds +55 fem og halvtreds +56 seks og halvtreds +57 syv og halvtreds +58 otte og halvtreds +59 ni og halvtreds +60 tres +60. tresinds tyvende +61 en og tres +62 to og tres +63 tre og tres +64 fire og tres +65 fem og tres +66 seks og tres +67 syv og tres +68 otte og tres +69 ni og tres +70 halvfjerds +70. halvfjerdsinds tyvende +71 en og halvfjerds +72 to og halvfjerds +73 tre og halvfjerds +74 fire og halvfjerds +75 fem og halvfjerds +76 seks og halvfjerds +77 syv og halvfjerds +78 otte og halvfjerds +79 ni og halvfjerds +80 firs +80. firsindstyvende +81 en og firs +82 to og firs +83 tre og firs +84 fire og firs +85 fem og firs +86 seks og firs +87 syv og firs +88 otte og firs +89 ni og firs +90 halvfems +90. 
halvfemsinds tyvende
+enogtyve en og tyve
+91 en og halvfems
+92 to og halvfems
+93 tre og halvfems
+94 fire og halvfems
+95 fem og halvfems
+96 seks og halvfems
+97 syv og halvfems
+98 otte og halvfems
+99 ni og halvfems
+100 hundrede
+1000 tusind
+2000 to tusind
+2001 to tusind og et
+2002 to tusind og to
+2003 to tusind og tre
+2004 to tusind og fire
+2005 to tusind og fem
+2006 to tusind og seks
+2007 to tusind og syv
+2008 to tusind og otte
+2009 to tusind og ni
+2010 to tusind og ti
+2011 to tusind og elleve
+2012 to tusind og tolv
+2013 to tusind og tretten
+2014 to tusind og fjorten
+2015 to tusind og femten
+2016 to tusind og seksten
+2017 to tusind og sytten
+2018 to tusind og atten
+2019 to tusind og nitten
+2020 to tusind og tyve
diff --git a/egs/sprakbanken/s5/local/normalize_transcript.py b/egs/sprakbanken/s5/local/normalize_transcript.py
index f759a39731d..2374418bee7 100755
--- a/egs/sprakbanken/s5/local/normalize_transcript.py
+++ b/egs/sprakbanken/s5/local/normalize_transcript.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 import codecs
 import sys
 import re
 import writenumbers
-
+from string import maketrans
 
 ## Global vars
 
@@ -16,7 +17,10 @@
     "\t": " "
     }
 
-t_table = str.maketrans(normdict)
+from_chars = ''.join(normdict.keys())
+to_chars = ''.join(normdict.values())
+
+#t_table = maketrans(from_chars, to_chars)
 
 ## Main
 
@@ -27,10 +31,11 @@
 
 for line in transcript:
-    normtext1 = line.translate(t_table)
-    normtext2 = re.sub(r' +', ' ', normtext1.strip())
-    normtext3 = writenumbers.normNumber(normtext2, numtable)
-    outtext.write(normtext3.upper() + "\n")
+    normtext1 = re.sub(r'[\.,:;\?]', '', line)
+    normtext2 = re.sub(r'[\t\\]', ' ', normtext1)
+    normtext3 = re.sub(r' +', ' ', normtext2.strip())
+    normtext4 = writenumbers.normNumber(normtext3, numtable)
+    outtext.write(normtext4)
 
 transcript.close()
 outtext.close()
diff --git a/egs/sprakbanken/s5/local/normalize_transcript_prefixed.py b/egs/sprakbanken/s5/local/normalize_transcript_prefixed.py
index e934533a393..557606ae205 100755
--- a/egs/sprakbanken/s5/local/normalize_transcript_prefixed.py
+++ b/egs/sprakbanken/s5/local/normalize_transcript_prefixed.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 '''
 # Copyright 2013-2014 Mirsk Digital Aps (Author: Andreas Kirkedal)
+# Copyright 2014-2016 Andreas Kirkedal
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,16 +25,16 @@
 ## Global vars
 
-normdict = {",": " ",
-            ":": " ",
-            ";": " ",
-            "?": " ",
-            "\\": " ",
-            "\t": " ",
-            #".": ""
-            }
+# normdict = {",": " ",
+#             ":": " ",
+#             ";": " ",
+#             "?": " ",
+#             "\\": " ",
+#             "\t": " ",
+#             #".": ""
+#             }
 
-t_table = str.maketrans(normdict)
+# t_table = str.maketrans(normdict)
 
 ## Utility function
 
@@ -51,12 +52,13 @@ def getuttid_text(line):
 
 for line in textin:
     utt_id, text = getuttid_text(line)
-    normtext1 = text.translate(t_table)
-    normtext2 = re.sub(r' +', ' ', normtext1.strip())
-    normtext3 = writenumbers.normNumber(normtext2, numtable)
-
+    normtext1 = re.sub(r'[\.,:;\?]', '', text)
+    normtext2 = re.sub(r'[\t\\]', ' ', normtext1)
+    normtext3 = re.sub(r' +', ' ', normtext2.strip())
+    normtext4 = writenumbers.normNumber(normtext3, numtable)
+    outtext.write(normtext4)
     fid.write(utt_id + "\n")
-    outtext.write(normtext3)
+
 
 textin.close()
 outtext.close()
diff --git a/egs/sprakbanken/s5/local/score.sh b/egs/sprakbanken/s5/local/score.sh
index abd8149a672..9fcafdc0b5c 100755
--- a/egs/sprakbanken/s5/local/score.sh
+++ b/egs/sprakbanken/s5/local/score.sh
@@ -1,18 +1,24 @@
 #!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
 # Apache 2.0
 
+# See the script steps/scoring/score_kaldi_cer.sh in case you need to evaluate CER
+
 [ -f ./path.sh ] && . ./path.sh
 
 # begin configuration section.
 cmd=run.pl
 stage=0
-decode_mbr=true
-word_ins_penalty=0.0
+decode_mbr=false
+stats=true
+beam=6
+word_ins_penalty=0.0,0.5,1.0
 min_lmwt=7
 max_lmwt=17
+iter=final
 #end configuration section.
 
+echo "$0 $@"  # Print the command line for logging
 [ -f ./path.sh ] && . ./path.sh
 . parse_options.sh || exit 1;
 
@@ -37,21 +43,107 @@
 for f in $symtab $dir/lat.1.gz $data/text; do
   [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
 done
 
-mkdir -p $dir/scoring/log
-cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
+ref_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
+hyp_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"
+
+
+if $decode_mbr ; then
+  echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
+else
+  echo "$0: scoring with word insertion penalty=$word_ins_penalty"
+fi
+
+
+mkdir -p $dir/scoring_kaldi
+cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
+if [ $stage -le 0 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    mkdir -p $dir/scoring_kaldi/penalty_$wip/log
+
+    if $decode_mbr ; then
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-prune --beam=$beam ark:- ark:- \| \
+        lattice-mbr-decode --word-symbol-table=$symtab \
+        ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+
+    else
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+    fi
+
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
+      cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
+      compute-wer --text --mode=present \
+      ark:$dir/scoring_kaldi/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
+
+  done
+fi
 
-$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
-  lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
-  lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \
-  lattice-best-path --word-symbol-table=$symtab \
-  ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
-# Note: the double level of quoting for the sed command
-$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
-  cat $dir/scoring/LMWT.tra \| \
-  utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
-  compute-wer --text --mode=present \
-  ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1;
+
+if [ $stage -le 1 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    for lmwt in $(seq $min_lmwt $max_lmwt); do
+      # adding /dev/null to the command list below forces grep to output the filename
+      grep WER $dir/wer_${lmwt}_${wip} /dev/null
+    done
+  done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1
+
+  best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer)
+  best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}')
+  best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}')
+
+  if [ -z "$best_lmwt" ]; then
+    echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong."
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+      utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+      ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+      '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all wer_{?,??} files;
+# these originate from the previous version of the scoring files.
+# I keep both rm statements here because removing only one of them could lead
+# to confusion about the capabilities of the script (we don't do CER in this script).
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
 
 exit 0;
diff --git a/egs/sprakbanken/s5/local/sprak_data_prep.sh b/egs/sprakbanken/s5/local/sprak_data_prep.sh
index c7a1d048a4f..1b2406620f2 100755
--- a/egs/sprakbanken/s5/local/sprak_data_prep.sh
+++ b/egs/sprakbanken/s5/local/sprak_data_prep.sh
@@ -2,6 +2,7 @@
 # Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
 # Copyright 2013-2014 Mirsk Digital Aps (Author: Andreas Kirkedal)
+# Copyright 2015-2016 Andreas Kirkedal
 
 # Apache 2.0.
 
@@ -21,12 +22,12 @@ utils=`pwd`/utils
 
 # This recipe currently relies on version 3 because python3 uses utf8 as internal
 # string representation
-if ! which python3 >&/dev/null; then
-  echo "Installing python3 since not on your path."
-  pushd $KALDI_ROOT/tools || exit 1;
-  extras/install_python3.sh || exit 1;
-  popd
-fi
+#if ! which python3 >&/dev/null; then
+#  echo "Installing python3 since not on your path."
+#  pushd $KALDI_ROOT/tools || exit 1;
+#  extras/install_python3.sh || exit 1;
+#  popd
+#fi
 
 if [ ! -d $dir/download ]; then
   mkdir -p $dir/download/0565-1 $dir/download/0565-2
 fi
 
 echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while."
 
 if [ ! -f $dir/download/da.16kHz.0565-1.tar.gz ]; then
-  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-1.tar.gz --directory-prefix=$dir/download ) &
+  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-1.tar.gz --directory-prefix=$dir/download )
 fi
 
 if [ ! -f $dir/download/da.16kHz.0565-2.tar.gz ]; then
-  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-2.tar.gz --directory-prefix=$dir/download ) &
+  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-2.tar.gz --directory-prefix=$dir/download )
 fi
 
-if [ ! -f $dir/download/da.16kHz.0565-1.tar.gz ]; then
-  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0611.tar.gz --directory-prefix=$dir/download ) &
+if [ !
-f $dir/download/da.16kHz.0611.tar.gz ]; then + ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0611.tar.gz --directory-prefix=$dir/download ) fi wait @@ -51,8 +52,8 @@ echo "Corpus files downloaded." if [ ! -d $dir/download/0611 ]; then echo "Unpacking files." - tar -xzf $dir/download/da.16kHz.0565-1.tar.gz -C $dir/download/0565-1 & - tar -xzf $dir/download/da.16kHz.0565-2.tar.gz -C $dir/download/0565-2 & + tar -xzf $dir/download/da.16kHz.0565-1.tar.gz -C $dir/download/0565-1 + tar -xzf $dir/download/da.16kHz.0565-2.tar.gz -C $dir/download/0565-2 tar -xzf $dir/download/da.16kHz.0611.tar.gz -C $dir/download # Note: rename "da 0611 test" to "da_0611_test" for this to work @@ -62,7 +63,7 @@ if [ ! -d $dir/download/0611 ]; then fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +sph2pipe=$(which sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe if [ ! -x $sph2pipe ]; then echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; exit 1; @@ -78,27 +79,25 @@ mkdir -p $dir/corpus_processed/training/0565-1 $dir/corpus_processed/training/05 # Create parallel file lists and text files, but keep sound files in the same location to save disk space # Writes the lists to data/local/data (~ 310h) echo "Creating parallel data for training data." -python3 $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 & # ~130h -python3 $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 & # ~115h -python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 & # ~51h +python $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 # ~130h +python $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 # ~115h +python $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 # ~51h ( # Ditto dev set (~ 16h) echo "Creating parallel data for test data." rm -rf $dir/corpus_processed/dev03 mkdir -p $dir/corpus_processed/dev03 - python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 & -) & + python $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 || exit 1; +) ( # Ditto test set (about 9 hours) echo "Creating parallel data for development data." rm -rf $dir/corpus_processed/test06 mkdir -p $dir/corpus_processed/test06 - python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1; -) & - -wait + python $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1; +) # Create the LM training data # Test and dev data is disjoint from training data, so we use those transcripts) @@ -110,10 +109,10 @@ wait ( echo "Writing the LM text to file and normalising." cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist | while read l; do cat $l; done > $lmdir/lmsents - python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl $lmdir/lmsents $lmdir/lmsents.norm + python local/normalize_transcript.py local/norm_dk/numbersLow.tbl $lmdir/lmsents $lmdir/lmsents.norm local/norm_dk/format_text.sh lm $lmdir/lmsents.norm > $lmdir/transcripts.txt sort -u $lmdir/transcripts.txt > $lmdir/transcripts.uniq -) & +) # Combine training file lists echo "Combine file lists." 
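
Reviewer note: the hunk above now serialises the LM text preparation and switches it to the
lowercase table. A minimal smoke test of that normalisation path, using only the call
signatures that appear in this patch; the sample sentence and the /tmp paths are invented
for illustration:

    # hypothetical one-line input; "21." should be expanded via numbersLow.tbl
    echo 'Mødet er den 21. i rækken' > /tmp/lmsents
    # same invocation as in the hunk above: <table> <input> <output>
    python local/normalize_transcript.py local/norm_dk/numbersLow.tbl /tmp/lmsents /tmp/lmsents.norm
    # 'lm' mode does sentence splitting; the output should contain "en og tyvende"
    local/norm_dk/format_text.sh lm /tmp/lmsents.norm
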
@@ -131,18 +130,15 @@ cp $dir/corpus_processed/test06/sndlist $dir/testsndfiles # Write wav.scp, utt2spk and text.unnormalised for train, test and dev sets with # Use sph2pipe because the wav files are actually sph files echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev" -python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe & -python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe & -python3 $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe & +python $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe +python $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe +python $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe -wait # Create the main data sets -local/create_datasets.sh $testdir data/test & -local/create_datasets.sh $devdir data/dev & -local/create_datasets.sh $traindir data/train & - -wait +local/create_datasets.sh $testdir data/test +local/create_datasets.sh $devdir data/dev +local/create_datasets.sh $traindir data/train ## TODO diff --git a/egs/sprakbanken/s5/local/wer_hyp_filter b/egs/sprakbanken/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..8ecbdd3ec04 --- /dev/null +++ b/egs/sprakbanken/s5/local/wer_hyp_filter @@ -0,0 +1,5 @@ +#!/bin/bash + +perl -C -pe 's:::g; s:::g; s:::g' | \ +perl -pe 's/é|è|ë/e/g; s/á|à|ä/a/g; s/ó|ò|ö/o/g; s/ú|ù|ü/u/g; s/É|È|Ë/E/g; s/Ó|Ò|Ö/O/g;' | \ +PERLIO=:utf8 perl -pe '$_=lc' diff --git a/egs/sprakbanken/s5/local/wer_output_filter b/egs/sprakbanken/s5/local/wer_output_filter new file mode 100755 index 00000000000..8ecbdd3ec04 --- /dev/null +++ b/egs/sprakbanken/s5/local/wer_output_filter @@ -0,0 +1,5 @@ +#!/bin/bash + +perl -C -pe 's:::g; s:::g; s:::g' | \ +perl -pe 's/é|è|ë/e/g; s/á|à|ä/a/g; s/ó|ò|ö/o/g; s/ú|ù|ü/u/g; s/É|È|Ë/E/g; s/Ó|Ò|Ö/O/g;' | \ +PERLIO=:utf8 perl -pe '$_=lc' diff --git a/egs/sprakbanken/s5/local/wer_ref_filter b/egs/sprakbanken/s5/local/wer_ref_filter new file mode 100755 index 00000000000..8ecbdd3ec04 --- /dev/null +++ b/egs/sprakbanken/s5/local/wer_ref_filter @@ -0,0 +1,5 @@ +#!/bin/bash + +perl -C -pe 's:::g; s:::g; s:::g' | \ +perl -pe 's/é|è|ë/e/g; s/á|à|ä/a/g; s/ó|ò|ö/o/g; s/ú|ù|ü/u/g; s/É|È|Ë/E/g; s/Ó|Ò|Ö/O/g;' | \ +PERLIO=:utf8 perl -pe '$_=lc' diff --git a/egs/sprakbanken/s5/local/writenumbers.py b/egs/sprakbanken/s5/local/writenumbers.py index 452cd3e7e9c..df3235243d4 100755 --- a/egs/sprakbanken/s5/local/writenumbers.py +++ b/egs/sprakbanken/s5/local/writenumbers.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- ''' # Copyright 2014 Author: Andreas Kirkedal diff --git a/egs/sprakbanken/s5/run.sh b/egs/sprakbanken/s5/run.sh index 34c1f18d964..53fd7b1484e 100755 --- a/egs/sprakbanken/s5/run.sh +++ b/egs/sprakbanken/s5/run.sh @@ -4,198 +4,139 @@ ## This relates to the queue. . ./path.sh # so python3 is on the path if not on the system (we made a link to utils/).a -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - - -# Download the corpus and prepare parallel lists of sound files and text files -# Divide the corpus into train, dev and test sets -local/sprak_data_prep.sh || exit 1; - -# Perform text normalisation, prepare dict folder and LM data transcriptions -# This setup uses previsously prepared data. 
eSpeak must be installed and in PATH to use dict_prep.sh
-#local/dict_prep.sh || exit 1;
-local/copy_dict.sh || exit 1;
-
-
-utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang_tmp data/lang || exit 1;
-
-# Now make MFCC features.
-# mfccdir should be some place with a largish disk where you
-# want to store MFCC features.
-mfccdir=mfcc
-
-
-# Extract mfccs
-# p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
-# wave files are corrupt
-# Will return a warning message because of the corrupt audio files, but compute them anyway
-# If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
-
-steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/test exp/make_mfcc/test mfcc &
-steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/dev exp/make_mfcc/dev mfcc &
-steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/train exp/make_mfcc/train mfcc || exit 1;
-wait
-
-# Compute cepstral mean and variance normalisation
-steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc &
-steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev mfcc &
-steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc
-
-wait
-
-# Repair data set (remove corrupt data points with corrupt audio)
-
-utils/fix_data_dir.sh data/test &
-utils/fix_data_dir.sh data/dev &
-utils/fix_data_dir.sh data/train
-wait
-
-# Train LM with CMUCLMTK
-# This setup uses IRSTLM
-#local/sprak_train_lm.sh &> data/local/cmuclmtk/lm.log
-
-# Train LM with irstlm
-local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
-local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
-
-# Make subset with 1k utterances for rapid testing
-# Randomly selects 980 utterances from 7 speakers
-utils/subset_data_dir.sh --per-spk data/test 140 data/test1k &
-
-# Now make subset of the training data with the shortest 120k utterances.
-utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
-
-# Train monophone model on short utterances
-steps/train_mono.sh --nj 30 --cmd "$train_cmd" \
-  data/train_120kshort data/lang exp/mono0a || exit 1;
-
-# Ensure that LMs are created
-wait
-
-utils/mkgraph.sh data/lang_test_3g exp/mono0a exp/mono0a/graph_3g &
-utils/mkgraph.sh data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
-
-# Ensure that all graphs are constructed
-wait
-
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/mono0a/graph_3g data/test1k exp/mono0a/decode_3g_test1k
-
-# steps/align_si.sh --boost-silence 1.25 --nj 42 --cmd "$train_cmd" \
-steps/align_si.sh --nj 30 --cmd "$train_cmd" \
-  data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
-
-# steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
-steps/train_deltas.sh --cmd "$train_cmd" \
-  2000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
-
-wait
-
-
-utils/mkgraph.sh data/lang_test_3g exp/tri1 exp/tri1/graph_3g &
-utils/mkgraph.sh data/lang_test_4g exp/tri1 exp/tri1/graph_4g || exit 1;
-
-(
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
-) &
-
-(
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k || exit 1;
-) &
-
-wait
-
-steps/align_si.sh --nj 30 --cmd "$train_cmd" \
-  data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
-
-
-# Train tri2a, which is deltas + delta-deltas.
-steps/train_deltas.sh --cmd "$train_cmd" \
-  2500 15000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1;
-
-utils/mkgraph.sh data/lang_test_3g exp/tri2a exp/tri2a/graph_3g || exit 1;
-
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri2a/graph_3g data/test1k exp/tri2a/decode_3g_test1k || exit 1;
-
-
-steps/train_lda_mllt.sh --cmd "$train_cmd" \
-  --splice-opts "--left-context=5 --right-context=5" \
-  2500 15000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
-
-utils/mkgraph.sh data/lang_test_3g exp/tri2b exp/tri2b/graph_3g || exit 1;
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri2b/graph_3g data/test1k exp/tri2b/decode_3g_test1k || exit 1;
-
-
-steps/align_si.sh --nj 30 --cmd "$train_cmd" \
-  --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
-
-wait
-
-
-# From 2b system, train 3b which is LDA + MLLT + SAT.
-steps/train_sat.sh --cmd "$train_cmd" \
-  2500 15000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
-utils/mkgraph.sh data/lang_test_3g exp/tri3b exp/tri3b/graph_3g || exit 1;
-steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri3b/graph_3g data/test1k exp/tri3b/decode_3g_test1k || exit 1;
-
-
-# Trying 4-gram language model
-utils/mkgraph.sh data/lang_test_4g exp/tri3b exp/tri3b/graph_4g || exit 1;
-
-steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 \
-  exp/tri3b/graph_4g data/test1k exp/tri3b/decode_4g_test1k || exit 1;
-
-# This is commented out for now as it's not important for the main recipe.
-## Train RNN for reranking
-#local/sprak_train_rnnlms.sh data/local/dict data/dev/transcripts.uniq data/local/rnnlms/g_c380_d1k_h100_v130k
-## Consumes a lot of memory! Do not run in parallel
-#local/sprak_run_rnnlms_tri3b.sh data/lang_test_3g data/local/rnnlms/g_c380_d1k_h100_v130k data/test1k exp/tri3b/decode_3g_test1k
-
-
-# From 3b system
-steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
+nj=12
+
+stage=0
+. utils/parse_options.sh
+
+if [ $stage -le 0 ]; then
+  # Download the corpus and prepare parallel lists of sound files and text files
+  # Divide the corpus into train, dev and test sets
+  local/sprak_data_prep.sh || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  # Perform text normalisation, prepare dict folder and LM data transcriptions
+  # This setup uses previously prepared data. eSpeak must be installed and in PATH to use dict_prep.sh
+  # local/dict_prep.sh || exit 1;
+  local/copy_dict.sh || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang_tmp data/lang || exit 1;
+fi
+
+if [ $stage -le 3 ]; then
+  # Extract mfccs
+  # p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
+  # wave files are corrupt
+  # Will return a warning message because of the corrupt audio files, but compute them anyway
+  # If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
+  for dataset in train test dev; do
+    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/$dataset || exit 1;
+
+    # Compute cepstral mean and variance normalisation
+    steps/compute_cmvn_stats.sh data/$dataset || exit 1;
+
+    # Repair data set (remove corrupt data points with corrupt audio)
+    utils/fix_data_dir.sh data/$dataset || exit 1;
+
+  done
+  # Make a subset of the training data with the shortest 120k utterances.
+ utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1; +fi + +if [ $stage -le 4 ]; then + # Train LM with irstlm + local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "tg" data/lang data/local/train3_lm &> data/local/tg.log || exit 1; + local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "fg" data/lang data/local/train4_lm &> data/local/fg.log || exit 1; +fi + +if [ $stage -le 5 ]; then + # Train monophone model on short utterances + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/train_120kshort data/lang exp/mono0a || exit 1; + utils/mkgraph.sh --mono data/lang_test_tg exp/mono0a exp/mono0a/graph_tg || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/mono0a/graph_tg data/dev exp/mono0a/decode_tg_dev || exit 1; +fi + +if [ $stage -le 6 ]; then + # Train tri1 (delta+delta-delta) + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/mono0a exp/mono0a_ali || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 3000 40000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1; + + # Decode dev set with both LMs + utils/mkgraph.sh data/lang_test_tg exp/tri1 exp/tri1/graph_tg || exit 1; + utils/mkgraph.sh data/lang_test_fg exp/tri1 exp/tri1/graph_fg || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/tri1/graph_fg data/dev exp/tri1/decode_fg_dev || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/tri1/graph_tg data/dev exp/tri1/decode_tg_dev || exit 1; +fi + +if [ $stage -le 7 ]; then + # Train tri2a (delta + delta-delta) + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 5000 60000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; + utils/mkgraph.sh data/lang_test_tg exp/tri2a exp/tri2a/graph_tg || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/tri2a/graph_tg data/dev exp/tri2a/decode_tg_dev || exit 1; +fi + +if [ $stage -le 8 ]; then + # Train tri2b (LDA+MLLT) + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=5 --right-context=5" \ + 6500 75000 data/train data/lang exp/tri2a_ali exp/tri2b || exit 1; + utils/mkgraph.sh data/lang_test_tg exp/tri2b exp/tri2b/graph_tg || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/tri2b/graph_tg data/dev exp/tri2b/decode_tg_dev || exit 1; +fi + +if [ $stage -le 9 ]; then + # From 2b system, train 3b which is LDA + MLLT + SAT. 
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+    --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
+  steps/train_sat.sh --cmd "$train_cmd" \
+    7500 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
+
+  # Decode dev with 4gram and 3gram LMs
+  utils/mkgraph.sh data/lang_test_tg exp/tri3b exp/tri3b/graph_tg || exit 1;
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 12 \
+    exp/tri3b/graph_tg data/dev exp/tri3b/decode_tg_dev || exit 1;
+  utils/mkgraph.sh data/lang_test_fg exp/tri3b exp/tri3b/graph_fg || exit 1;
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 12 \
+    exp/tri3b/graph_fg data/dev exp/tri3b/decode_fg_dev || exit 1;
+
+  # Decode test with 4gram and 3gram LMs
+  # there are fewer speakers (n=7) and decoding usually ends up waiting
+  # for a single job so we use --num-threads 2 to speed up
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 --num-threads 2 \
+    exp/tri3b/graph_tg data/test exp/tri3b/decode_tg_test || exit 1;
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 --num-threads 2 \
+    exp/tri3b/graph_fg data/test exp/tri3b/decode_fg_test || exit 1;
+fi
+
+if [ $stage -le 10 ]; then
+# Alignment used to train nnets and sgmms
+steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
   data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
+fi
 
-# From 3b system, train another SAT system (tri4a) with all the si284 data.
-
-steps/train_sat.sh --cmd "$train_cmd" \
-  4200 40000 data/train data/lang exp/tri3b_ali exp/tri4a || exit 1;
-
-utils/mkgraph.sh data/lang_test_3g exp/tri4a exp/tri4a/graph_3g || exit 1;
-steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri4a/graph_3g data/test1k exp/tri4a/decode_3g_test1k || exit 1;
-
-
-steps/train_quick.sh --cmd "$train_cmd" \
-  4200 40000 data/train data/lang exp/tri3b_ali exp/tri4b || exit 1;
-
-(
-  utils/mkgraph.sh data/lang_test_3g exp/tri4b exp/tri4b/graph_3g || exit 1;
-  steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-    exp/tri4b/graph_3g data/test1k exp/tri4b/decode_3g_test1k || exit 1;
-) &
-
-  utils/mkgraph.sh data/lang_test_4g exp/tri4b exp/tri4b/graph_4g || exit 1;
-  steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-    exp/tri4b/graph_4g data/test1k exp/tri4b/decode_4g_test1k || exit 1;
-
-wait
-
-# alignment used to train nnets and sgmms
-steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
-  data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
+##TODO: Add nnet3 and chain setups
 
 ## Works
-local/sprak_run_nnet_cpu.sh 3g test1k
+#local/sprak_run_nnet_cpu.sh tg dev
 
 ## Works
-local/sprak_run_sgmm2.sh test1k
+#local/sprak_run_sgmm2.sh dev
 
 # Getting results [see RESULTS file]
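
Reviewer note: because nj and stage are defined before sourcing utils/parse_options.sh,
they are now exposed as command-line flags, so partial reruns no longer require editing
run.sh. A usage sketch (the stage numbers refer to the blocks above):

    # full run with the defaults (nj=12, stage=0)
    ./run.sh
    # resume from MFCC extraction (stage 3) with 8 parallel jobs
    ./run.sh --stage 3 --nj 8
    # redo only the LDA+MLLT+SAT system and everything after it (stage 9 onwards)
    ./run.sh --stage 9
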