kaldi-asr · danpovey · Dec 15, 2016 · Nov 10, 2016 · Nov 10, 2016 · Nov 17, 2016
diff --git a/egs/sprakbanken/s5/local/copy_dict.sh b/egs/sprakbanken/s5/local/copy_dict.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 
 # Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
-# Copyright 2014 Mirsk Digital ApS  (Author: Andreas Kirkedal)
+# Copyright 2014-15 Mirsk Digital ApS  (Author: Andreas Kirkedal)
+# Copyright 2016 Andreas Kirkedal
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,9 +17,7 @@
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
 
-KALDI_ROOT=$(pwd)/../../..
-
-exproot=$(pwd)
+lex=lexicon-da-nonorm.tgz
 dir=data/local/dict
 mkdir -p $dir
 
@@ -31,22 +30,13 @@ cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
 cp local/dictsrc/extra_questions.txt $dir/extra_questions.txt
 
 # Copy pre-made lexicon
-wget http://www.openslr.org/resources/8/lexicon-da.tgz --directory-prefix=data/local/data/download
-tar -xzf data/local/data/download/lexicon-da.tgz -C $dir
+wget http://www.openslr.org/resources/8/$lex --directory-prefix=data/local/data/download
+tar -xzf data/local/data/download/$lex -C $dir
 
 
 # silence phones, one per line.
-echo SIL > $dir/silence_phones.txt
+echo -e "SIL\nSPN" > $dir/silence_phones.txt
 echo SIL > $dir/optional_silence.txt
 
-
-
-
-
-wait
-
-
-## TODO: add cleanup commands
-
 echo "Dictionary preparation succeeded"
 
diff --git a/egs/sprakbanken/s5/local/create_datasets.sh b/egs/sprakbanken/s5/local/create_datasets.sh
@@ -24,7 +24,7 @@ fi
 src=$1
 dest=$2
 mkdir $dest
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am 
+python local/normalize_transcript_prefixed.py local/norm_dk/numbersLow.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am 
 local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
 paste -d ' ' $src/onlyids $src/onlytext > $dest/text
 for f in wav.scp utt2spk; do

diff --git a/egs/sprakbanken/s5/local/dict_prep.sh b/egs/sprakbanken/s5/local/dict_prep.sh
@@ -2,6 +2,7 @@
 
 # Copyright 2010-2012 Microsoft Corporation  Johns Hopkins University (Author: Daniel Povey)
 # Copyright 2014 Mirsk Digital ApS  (Author: Andreas Kirkedal)
+# Copyright 2014-2016 Andreas Kirkedal
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,57 +20,24 @@
 KALDI_ROOT=$(pwd)/../../..
 
 exproot=$(pwd)
-dir=data/local/dict
+lmdir=data/local/transcript_lm
+dictsrc=data/local/dictsrc
+dictdir=data/local/dict
 espeakdir='espeak-1.48.04-source'
-mkdir -p $dir
+mkdir -p $dictsrc $dictdir
 
 
 # Dictionary preparation:
 
-
-# Normalise transcripts and create a transcript file
-# Removes '.,:;?' and removes '\' before '\Komma' (dictated ',') 
-# outputs a normalised transcript without utterance ids and a list of utterance ids 
-echo "Normalising"
-
-# Create dir to hold lm files and other non-standard files, useful for debugging
-trainsrc=data/local/trainsrc
-rm -rf $trainsrc
-mkdir $trainsrc
-mv data/train/text1 $trainsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp
-
-# Additional normalisation, uppercasing, writing numbers etc.
-# and recombine with 
-local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am
-cp $dir/transcripts.am $trainsrc/onlytext
-paste $trainsrc/onlyids $trainsrc/onlytext > data/train/text 
-utils/validate_data_dir.sh --no-feat data/train || exit 1;
-
-
-
-# lmsents is output by sprak_data_prep.sh and contains
-# sentences that are disjoint from the test and dev set 
-python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm
-wait
-
 # Create wordlist from the AM transcripts
-cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt &
-
-# Because training data is read aloud, there are many occurences of the same
-# sentence and bias towards the domain. Make a version where  
-# the sentences are unique to reduce bias.
-local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt
-sort -u $dir/transcripts.txt > $dir/transcripts.uniq
-
+cat $lmdir/transcripts.uniq | tr [:blank:] '\n' | sort -u > $dictsrc/wlist.txt &
 
 # Install eSpeak if it is not installed already
-
 if hash espeak 2>/dev/null;
-  then
+then
     echo 'eSpeak installed'
-  else
-    cd $KALDI_ROOT/tools || exit 1; 
+else
+    cd $KALDI_ROOT/tools || exit 1;
     wget http://sourceforge.net/projects/espeak/files/espeak/espeak-1.48/${espeakdir}.zip
     wait
     unzip -q $espeakdir.zip
@@ -81,87 +49,60 @@ if hash espeak 2>/dev/null;
     cd $exproot || exit 1;
 fi
 
-
-
 # Wait for the wordlist to be fully created
-wait 
-
+wait
 
 # Run wordlist through espeak to get phonetics
 # improvised parallelisation - simple call because 'split' often has different versions
-split -l 10000 $dir/wlist.txt $dir/Wtemp_
-for w in $dir/Wtemp_*; do
-  (cat $w | espeak -q -vda -x > $w.pho) &
+split -l 10000 $dictsrc/wlist.txt $dictsrc/Wtemp_
+for w in $dictsrc/Wtemp_*; do
+    (cat $w | espeak -q -vda -x > $w.pho) &
 done
 
 wait
 
-cat $dir/Wtemp_*.pho > $dir/plist.txt
-rm -f $dir/Wtemp_*
+cat $dictsrc/Wtemp_*.pho > $dictsrc/plist.txt
+rm -f $dictsrc/Wtemp_*
 
 
 # Filter transcription
-# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove 
+# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove
 # initial and trailing spaces and collapse 2 or more spaces to one space
 
-cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt
+cat $dictsrc/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dictsrc/plist2.txt
 
 #Some question marks are not caught above
-perl -pe 's/ \? / /g' $dir/plist2.txt > $dir/plist3.txt
+perl -pe 's/ \? / /g' $dictsrc/plist2.txt > $dictsrc/plist3.txt
 
 # Create lexicon.txt and put it in data/local/dict
-paste $dir/wlist.txt $dir/plist3.txt > $dir/lexicon1.txt
+paste $dictsrc/wlist.txt $dictsrc/plist3.txt > $dictsrc/lexicon1.txt
 
 # Remove entries without transcription
-grep -P  "^.+\t.+$" $dir/lexicon1.txt > $dir/lexicon2.txt
+grep -P  "^.+\t.+$" $dictsrc/lexicon1.txt > $dictsrc/lexicon2.txt
 
 # Copy pre-made phone table with
-cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
+cp local/dictsrc/complexphones.txt $dictdir/nonsilence_phones.txt
 
 
 # Add "!SIL SIL" to lexicon.txt
-echo -e '!SIL\tSIL' > $dir/lex_first
-echo -e '<UNK>\tSPN' >> $dir/lex_first
-cat $dir/lexicon2.txt >> $dir/lex_first
-mv $dir/lex_first $dir/lexicon.txt
+echo -e '!SIL\tSIL' > $dictsrc/lex_first
+echo -e '<UNK>\tSPN' >> $dictsrc/lex_first
+cat $dictsrc/lexicon2.txt >> $dictsrc/lex_first
+mv $dictsrc/lex_first $dictdir/lexicon.txt
 
 # silence phones, one per line.
-echo SIL > $dir/silence_phones.txt
-echo SIL > $dir/optional_silence.txt
-
-touch $dir/extra_questions.txt
-
-# Repeat text preparation on test set, but do not add to dictionary
-# Create dir to hold lm files and other non-standard files 
-testsrc=data/local/testsrc
-rm -rf $testsrc
-mkdir $testsrc
-mv data/test/text1 $testsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am 
-local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
-paste $testsrc/onlyids $testsrc/onlytext > data/test/text
-utils/validate_data_dir.sh --no-feat data/test || exit 1;
-
-# Repeat text preparation on dev set, but do not add to dictionary
-# Create dir to hold lm files and other non-standard files 
-devsrc=data/local/devsrc
-rm -rf $devsrc
-mkdir $devsrc
-mv data/dev/text1 $devsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
-local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
-paste $devsrc/onlyids $devsrc/onlytext > data/dev/text &
-
-# Also create a file that can be used for reranking using text features
-local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
-sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq
-
-
-utils/validate_data_dir.sh --no-feat data/dev || exit 1;
 
+if [ ! -f $dictdir/silence_phones.txt ]; then
+    echo SIL > $dictdir/silence_phones.txt
+fi
 
+if [ ! -f $dictdir/optional_silence.txt ]; then
+    echo SIL > $dictdir/optional_silence.txt
+fi
 
-## TODO: add cleanup commands
+if [ ! -f $dictdir/extra_questions.txt ]; then
+    touch $dictdir/extra_questions.txt
+fi
 
-echo "Normalisation and dictionary preparation succeeded"
 
+echo "Dictionary preparation succeeded"
diff --git a/egs/sprakbanken/s5/local/norm_dk/format_text.sh b/egs/sprakbanken/s5/local/norm_dk/format_text.sh
@@ -34,8 +34,8 @@ nonum=$tmp/nonum.tmp
 
 cat $2 | tr -d '\r' > $src
 
-$dir/expand_abbr_medical.sh $src > $abbr;
-$dir/remove_annotation.sh $abbr > $rem;
+#$dir/expand_abbr_medical.sh $src > $abbr;
+$dir/remove_annotation.sh $src > $rem;
 if [ $mode != "am" ]; then
     $dir/sent_split.sh $rem > $line;
 else
@@ -45,10 +45,11 @@ fi
 $dir/expand_dates.sh $line |\
 $dir/format_punct.sh  >  $num;
 #python3 $dir/writenumbers.py $dir/numbersUp.tbl $num $nonum;
-cat $num | $dir/write_punct.sh | \
+# $dir/write_punct.sh | \
+cat $num | \
 perl -pi -e "s/^\n//" | \
-perl -pe 's/ (.{4}.*?)\./ \1/g' | \
-PERLIO=:utf8 perl -pe '$_=uc'
+perl -pe 's/ (.{4}.*?)\./ \1/g'
+# | PERLIO=:utf8 perl -pe '$_=lc'
 
 # Comment this line for debugging
 wait