diff --git a/egs/sprakbanken/s5/local/copy_dict.sh b/egs/sprakbanken/s5/local/copy_dict.sh
index c5cd1fc77b4..5ae5e9697b1 100755
--- a/egs/sprakbanken/s5/local/copy_dict.sh
+++ b/egs/sprakbanken/s5/local/copy_dict.sh
@@ -1,7 +1,8 @@
 #!/bin/bash
 # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
-# Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
+# Copyright 2014-15 Mirsk Digital ApS (Author: Andreas Kirkedal)
+# Copyright 2016 Andreas Kirkedal
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,9 +17,7 @@
 # See the Apache 2 License for the specific language governing permissions and
 # limitations under the License.
-KALDI_ROOT=$(pwd)/../../..
-
-exproot=$(pwd)
+lex=lexicon-da-nonorm.tgz
 dir=data/local/dict
 mkdir -p $dir
@@ -31,22 +30,13 @@ cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
 cp local/dictsrc/extra_questions.txt $dir/extra_questions.txt
 # Copy pre-made lexicon
-wget http://www.openslr.org/resources/8/lexicon-da.tgz --directory-prefix=data/local/data/download
-tar -xzf data/local/data/download/lexicon-da.tgz -C $dir
+wget http://www.openslr.org/resources/8/$lex --directory-prefix=data/local/data/download
+tar -xzf data/local/data/download/$lex -C $dir
 # silence phones, one per line.
-echo SIL > $dir/silence_phones.txt
+echo -e "SIL\nSPN" > $dir/silence_phones.txt
 echo SIL > $dir/optional_silence.txt
-
-
-
-
-wait
-
-
-## TODO: add cleanup commands
-
 echo "Dictionary preparation succeeded"
diff --git a/egs/sprakbanken/s5/local/create_datasets.sh b/egs/sprakbanken/s5/local/create_datasets.sh
index b0d87a730e8..891771dbce1 100755
--- a/egs/sprakbanken/s5/local/create_datasets.sh
+++ b/egs/sprakbanken/s5/local/create_datasets.sh
@@ -24,7 +24,7 @@ fi
 src=$1
 dest=$2
 mkdir $dest
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
+python local/normalize_transcript_prefixed.py local/norm_dk/numbersLow.tbl $src/text.unnormalised $src/onlyids $src/transcripts.am
 local/norm_dk/format_text.sh am $src/transcripts.am > $src/onlytext
 paste -d ' ' $src/onlyids $src/onlytext > $dest/text
 for f in wav.scp utt2spk; do
diff --git a/egs/sprakbanken/s5/local/dict_prep.sh b/egs/sprakbanken/s5/local/dict_prep.sh
index 8ecfa028408..1e37460dbe5 100755
--- a/egs/sprakbanken/s5/local/dict_prep.sh
+++ b/egs/sprakbanken/s5/local/dict_prep.sh
@@ -2,6 +2,7 @@
 # Copyright 2010-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
 # Copyright 2014 Mirsk Digital ApS (Author: Andreas Kirkedal)
+# Copyright 2014-2016 Andreas Kirkedal
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,57 +20,24 @@
 KALDI_ROOT=$(pwd)/../../..
 exproot=$(pwd)
-dir=data/local/dict
+lmdir=data/local/transcript_lm
+dictsrc=data/local/dictsrc
+dictdir=data/local/dict
 espeakdir='espeak-1.48.04-source'
-mkdir -p $dir
+mkdir -p $dictsrc $dictdir
 # Dictionary preparation:
-
-# Normalise transcripts and create a transcript file
-# Removes '.,:;?'
and removes '\' before '\Komma' (dictated ',') -# outputs a normalised transcript without utterance ids and a list of utterance ids -echo "Normalising" - -# Create dir to hold lm files and other non-standard files, useful for debugging -trainsrc=data/local/trainsrc -rm -rf $trainsrc -mkdir $trainsrc -mv data/train/text1 $trainsrc/text1 -python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $trainsrc/text1 $trainsrc/onlyids $dir/transcripts.tmp - -# Additional normalisation, uppercasing, writing numbers etc. -# and recombine with -local/norm_dk/format_text.sh am $dir/transcripts.tmp > $dir/transcripts.am -cp $dir/transcripts.am $trainsrc/onlytext -paste $trainsrc/onlyids $trainsrc/onlytext > data/train/text -utils/validate_data_dir.sh --no-feat data/train || exit 1; - - - -# lmsents is output by sprak_data_prep.sh and contains -# sentences that are disjoint from the test and dev set -python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl data/local/data/lmsents $dir/lmsents.norm -wait - # Create wordlist from the AM transcripts -cat $dir/transcripts.am | tr [:blank:] '\n' | sort -u > $dir/wlist.txt & - -# Because training data is read aloud, there are many occurences of the same -# sentence and bias towards the domain. Make a version where -# the sentences are unique to reduce bias. -local/norm_dk/format_text.sh lm $dir/lmsents.norm > $dir/transcripts.txt -sort -u $dir/transcripts.txt > $dir/transcripts.uniq - +cat $lmdir/transcripts.uniq | tr [:blank:] '\n' | sort -u > $dictsrc/wlist.txt & # Install eSpeak if it is not installed already - if hash espeak 2>/dev/null; - then +then echo 'eSpeak installed' - else - cd $KALDI_ROOT/tools || exit 1; +else + cd $KALDI_ROOT/tools || exit 1; wget http://sourceforge.net/projects/espeak/files/espeak/espeak-1.48/${espeakdir}.zip wait unzip -q $espeakdir.zip @@ -81,87 +49,60 @@ if hash espeak 2>/dev/null; cd $exproot || exit 1; fi - - # Wait for the wordlist to be fully created -wait - +wait # Run wordlist through espeak to get phonetics # improvised parallelisation - simple call because 'split' often has different versions -split -l 10000 $dir/wlist.txt $dir/Wtemp_ -for w in $dir/Wtemp_*; do - (cat $w | espeak -q -vda -x > $w.pho) & +split -l 10000 $dictsrc/wlist.txt $dictsrc/Wtemp_ +for w in $dictsrc/Wtemp_*; do + (cat $w | espeak -q -vda -x > $w.pho) & done wait -cat $dir/Wtemp_*.pho > $dir/plist.txt -rm -f $dir/Wtemp_* +cat $dictsrc/Wtemp_*.pho > $dictsrc/plist.txt +rm -f $dictsrc/Wtemp_* # Filter transcription -# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove +# Remove diacritics, language annotation ((da), (en), (fr) etc.), insert space between symbols, remove # initial and trailing spaces and collapse 2 or more spaces to one space -cat $dir/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dir/plist2.txt +cat $dictsrc/plist.txt | perl -pe 's/\([[a-z]{2}\)//g' | perl -pe 's// /g' | perl -pe 's/ a I / aI /g' | perl -pe 's/ d Z / dZ /g' | perl -pe 's/ \? / /g' | perl -pe 's/ ([\#]) /\+ /g' | perl -pe 's/([\@n3]) \- /\1\- /g' | perl -pe "s/[\_\:\!\'\,\|2]//g" | perl -pe 's/ \- / /g' | tr -s ' ' | perl -pe 's/^ +| +$//g' > $dictsrc/plist2.txt #Some question marks are not caught above -perl -pe 's/ \? 
/ /g' $dictsrc/plist2.txt > $dictsrc/plist3.txt
 
 # Create lexicon.txt and put it in data/local/dict
-paste $dir/wlist.txt $dir/plist3.txt > $dir/lexicon1.txt
+paste $dictsrc/wlist.txt $dictsrc/plist3.txt > $dictsrc/lexicon1.txt
 
 # Remove entries without transcription
-grep -P "^.+\t.+$" $dir/lexicon1.txt > $dir/lexicon2.txt
+grep -P "^.+\t.+$" $dictsrc/lexicon1.txt > $dictsrc/lexicon2.txt
 
 # Copy pre-made phone table with
-cp local/dictsrc/complexphones.txt $dir/nonsilence_phones.txt
+cp local/dictsrc/complexphones.txt $dictdir/nonsilence_phones.txt
 
 # Add "!SIL SIL" to lexicon.txt
-echo -e '!SIL\tSIL' > $dir/lex_first
-echo -e '<UNK>\tSPN' >> $dir/lex_first
-cat $dir/lexicon2.txt >> $dir/lex_first
-mv $dir/lex_first $dir/lexicon.txt
+echo -e '!SIL\tSIL' > $dictsrc/lex_first
+echo -e '<UNK>\tSPN' >> $dictsrc/lex_first
+cat $dictsrc/lexicon2.txt >> $dictsrc/lex_first
+mv $dictsrc/lex_first $dictdir/lexicon.txt
 
 # silence phones, one per line.
-echo SIL > $dir/silence_phones.txt
-echo SIL > $dir/optional_silence.txt
-
-touch $dir/extra_questions.txt
-
-# Repeat text preparation on test set, but do not add to dictionary
-# Create dir to hold lm files and other non-standard files
-testsrc=data/local/testsrc
-rm -rf $testsrc
-mkdir $testsrc
-mv data/test/text1 $testsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $testsrc/text1 $testsrc/onlyids $testsrc/transcripts.am
-local/norm_dk/format_text.sh am $testsrc/transcripts.am > $testsrc/onlytext
-paste $testsrc/onlyids $testsrc/onlytext > data/test/text
-utils/validate_data_dir.sh --no-feat data/test || exit 1;
-
-# Repeat text preparation on dev set, but do not add to dictionary
-# Create dir to hold lm files and other non-standard files
-devsrc=data/local/devsrc
-rm -rf $devsrc
-mkdir $devsrc
-mv data/dev/text1 $devsrc/text1
-python3 local/normalize_transcript_prefixed.py local/norm_dk/numbersUp.tbl $devsrc/text1 $devsrc/onlyids $devsrc/transcripts.tmp
-local/norm_dk/format_text.sh am $devsrc/transcripts.tmp > $devsrc/onlytext
-paste $devsrc/onlyids $devsrc/onlytext > data/dev/text &
-
-# Also create a file that can be used for reranking using text features
-local/norm_dk/format_text.sh lm $devsrc/transcripts.tmp > data/dev/transcripts.txt
-sort -u data/dev/transcripts.txt > data/dev/transcripts.uniq
-
-
-utils/validate_data_dir.sh --no-feat data/dev || exit 1;
+if [ ! -f $dictdir/silence_phones.txt ]; then
+  echo SIL > $dictdir/silence_phones.txt
+fi
+if [ ! -f $dictdir/optional_silence.txt ]; then
+  echo SIL > $dictdir/optional_silence.txt
+fi
 
-## TODO: add cleanup commands
+if [ !
-f $dictdir/extra_questions.txt ]; then + touch $dictdir/extra_questions.txt +fi -echo "Normalisation and dictionary preparation succeeded" +echo "Dictionary preparation succeeded" diff --git a/egs/sprakbanken/s5/local/norm_dk/format_text.sh b/egs/sprakbanken/s5/local/norm_dk/format_text.sh index ff85c8cc0ef..abbf975dbdf 100755 --- a/egs/sprakbanken/s5/local/norm_dk/format_text.sh +++ b/egs/sprakbanken/s5/local/norm_dk/format_text.sh @@ -34,8 +34,8 @@ nonum=$tmp/nonum.tmp cat $2 | tr -d '\r' > $src -$dir/expand_abbr_medical.sh $src > $abbr; -$dir/remove_annotation.sh $abbr > $rem; +#$dir/expand_abbr_medical.sh $src > $abbr; +$dir/remove_annotation.sh $src > $rem; if [ $mode != "am" ]; then $dir/sent_split.sh $rem > $line; else @@ -45,10 +45,11 @@ fi $dir/expand_dates.sh $line |\ $dir/format_punct.sh > $num; #python3 $dir/writenumbers.py $dir/numbersUp.tbl $num $nonum; -cat $num | $dir/write_punct.sh | \ +# $dir/write_punct.sh | \ +cat $num | \ perl -pi -e "s/^\n//" | \ -perl -pe 's/ (.{4}.*?)\./ \1/g' | \ -PERLIO=:utf8 perl -pe '$_=uc' +perl -pe 's/ (.{4}.*?)\./ \1/g' +# | PERLIO=:utf8 perl -pe '$_=lc' # Comment this line for debugging wait diff --git a/egs/sprakbanken/s5/local/norm_dk/numbersLow.tbl b/egs/sprakbanken/s5/local/norm_dk/numbersLow.tbl new file mode 100644 index 00000000000..824c0afa3b2 --- /dev/null +++ b/egs/sprakbanken/s5/local/norm_dk/numbersLow.tbl @@ -0,0 +1,265 @@ +¼ en fjerdedel +½ en halv +0 nul +² i anden +enogfirs en og firs +enogfyrre en og fyrre +enoghalvfems en og halvfems +enoghalvfjerds en og halvfjerds +enoghalvtreds en og halvtreds +enogtredive en og tredive +enogtredivte en og tredivte +enogtres en og tres +enogtyvende en og tyvende +femogfirs fem og firs +femogfyrre fem og fyrre +femoghalvfems fem og halvfems +femoghalvfjerds fem og halvfjerds +femoghalvtreds fem og halvtreds +femogtredive fem og tredive +femogtres fem og tres +femogtyve fem og tyve +femogtyvende fem og tyvende +fireogfirs fire og firs +fireogfyrre fire og fyrre +fireoghalvfems fire og halvfems +fireoghalvfjerds fire og halvfjerds +fireoghalvtreds fire og halvtreds +fireogtredive fire og tredive +fireogtres fire og tres +fireogtyve fire og tyve +fireogtyvende fire og tyvende +fyrreogtyvende fyrre og tyvende +niogfirs ni og firs +niogfyrre ni og fyrre +nioghalvfems ni og halvfems +nioghalvfjerds ni og halvfjerds +nioghalvtreds ni og halvtreds +niogtredive ni og tredive +niogtres ni og tres +niogtyvende ni og tyvende +niogtyve ni og tyve +otteogfirs otte og firs +otteogfyrre otte og fyrre +otteoghalvfems otte og halvfems +otteoghalvfjerds otte og halvfjerds +otteoghalvtreds otte og halvtreds +otteogtredive otte og tredive +otteogtres otte og tres +otteogtyvende otte og tyvende +otteogtyve otte og tyve +seksogfirs seks og firs +seksogfyrre seks og fyrre +seksoghalvfems seks og halvfems +seksoghalvfjerds seks og halvfjerds +seksoghalvtreds seks og halvtreds +seksogtredive seks og tredive +seksogtres seks og tres +seksogtyvende seks og tyvende +seksogtyve seks og tyve +syvogfirs syv og firs +syvogfyrre syv og fyrre +syvoghalvfems syv og halvfems +syvoghalvfjerds syv og halvfjerds +syvoghalvtreds syv og halvtreds +syvogtredive syv og tredive +syvogtres syv og tres +syvogtyvende syv og tyvende +syvogtyve syv og tyve +toogfirs to og firs +toogfyrre to og fyrre +tooghalvfems to og halvfems +tooghalvfjerds to og halvfjerds +tooghalvtreds to og halvtreds +toogtredive to og tredive +toogtres to og tres +toogtyvende to og tyvende +toogtyve to og tyve +totusindogatten to tusind og atten 
+totusindogelleve to tusind og elleve +totusindoget to tusind og et +totusindogfemten to tusind og femten +totusindogfem to tusind og fem +totusindogfire to tusind og fire +totusindogfjorten to tusind og fjorten +totusindogni to tusind og ni +totusindognitten to tusind og nitten +totusindogotte to tusind og otte +totusindogseksten to tusind og seksten +totusindogseks to tusind og seks +totusindogsytten to tusind og sytten +totusindogsyv to tusind og syv +totusindogti to tusind og ti +totusindogtolv to tusind og tolv +totusindogto to tusind og to +totusindogtre to tusind og tre +totusindogtretten to tusind og tretten +totusindogtyve to tusind og tyve +treogfirs tre og firs +treogfyrre tre og fyrre +treoghalvfems tre og halvfems +treoghalvfjerds tre og halvfjerds +treoghalvtreds tre og halvtreds +treogtredive tre og tredive +treogtres tre og tres +treogtyvende tre og tyvende +treogtyve tre og tyve +1 en +1. første +2. anden +2 to +3 tre +3. tredje +4 fire +4. fjerde +5 fem +5. femte +6 seks +6. sjette +7 syv +7. syvende +8 otte +8. ottende +9 ni +9. niende +10 ti +10. tiende +11 elleve +11. ellevte +12 tolv +12. tolvte +13 tretten +13. trettende +14 fjorten +14. fjortende +15 femten +15. femtende +16 seksten +16. sekstende +17 sytten +17. syttende +18 atten +18. attende +19 nitten +19. nittende +20 tyve +20. tyvende +21 en og tyve +21. en og tyvende +22 to og tyve +22. to og tyvende +23 tre og tyve +23. tre og tyvende +24 fire og tyve +24. fire og tyvende +25 fem og tyve +25. fem og tyvende +26 seks og tyve +26. seks og tyvende +27 syv og tyve +27. syv og tyvende +28 otte og tyve +28. otte og tyvende +29 ni og tyve +29. ni og tyvende +30 tredive +30. tredivte +31 en og tredive +31. en og tredivte +32 to og tredive +33 tre og tredive +34 fire og tredive +35 fem og tredive +36 seks og tredive +37 syv og tredive +38 otte og tredive +39 ni og tredive +40 fyrre +40. fyrre og tyvende +41 en og fyrre +42 to og fyrre +43 tre og fyrre +44 fire og fyrre +45 fem og fyrre +46 seks og fyrre +47 syv og fyrre +48 otte og fyrre +49 ni og fyrre +50 halvtreds +50. halvtredsinds tyvende +51 en og halvtreds +52 to og halvtreds +53 tre og halvtreds +54 fire og halvtreds +55 fem og halvtreds +56 seks og halvtreds +57 syv og halvtreds +58 otte og halvtreds +59 ni og halvtreds +60 tres +60. tresinds tyvende +61 en og tres +62 to og tres +63 tre og tres +64 fire og tres +65 fem og tres +66 seks og tres +67 syv og tres +68 otte og tres +69 ni og tres +70 halvfjerds +70. halvfjerdsinds tyvende +71 en og halvfjerds +72 to og halvfjerds +73 tre og halvfjerds +74 fire og halvfjerds +75 fem og halvfjerds +76 seks og halvfjerds +77 syv og halvfjerds +78 otte og halvfjerds +79 ni og halvfjerds +80 firs +80. firsindstyvende +81 en og firs +82 to og firs +83 tre og firs +84 fire og firs +85 fem og firs +86 seks og firs +87 syv og firs +88 otte og firs +89 ni og firs +90 halvfems +90. 
halvfemsinds tyvende
+enogtyve en og tyve
+91 en og halvfems
+92 to og halvfems
+93 tre og halvfems
+94 fire og halvfems
+95 fem og halvfems
+96 seks og halvfems
+97 syv og halvfems
+98 otte og halvfems
+99 ni og halvfems
+100 hundrede
+1000 tusind
+2000 to tusind
+2001 to tusind og et
+2002 to tusind og to
+2003 to tusind og tre
+2004 to tusind og fire
+2005 to tusind og fem
+2006 to tusind og seks
+2007 to tusind og syv
+2008 to tusind og otte
+2009 to tusind og ni
+2010 to tusind og ti
+2011 to tusind og elleve
+2012 to tusind og tolv
+2013 to tusind og tretten
+2014 to tusind og fjorten
+2015 to tusind og femten
+2016 to tusind og seksten
+2017 to tusind og sytten
+2018 to tusind og atten
+2019 to tusind og nitten
+2020 to tusind og tyve
diff --git a/egs/sprakbanken/s5/local/normalize_transcript.py b/egs/sprakbanken/s5/local/normalize_transcript.py
index f759a39731d..2374418bee7 100755
--- a/egs/sprakbanken/s5/local/normalize_transcript.py
+++ b/egs/sprakbanken/s5/local/normalize_transcript.py
@@ -1,9 +1,10 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 import codecs
 import sys
 import re
 import writenumbers
-
+from string import maketrans
 
 ## Global vars
 
@@ -16,7 +17,10 @@
     "\t": " "
     }
 
-t_table = str.maketrans(normdict)
+from_chars = ''.join(normdict.keys())
+to_chars = ''.join(normdict.values())
+
+#t_table = maketrans(from_chars, to_chars)
 
 ## Main
 
@@ -27,10 +31,11 @@
 
 for line in transcript:
-    normtext1 = line.translate(t_table)
-    normtext2 = re.sub(r' +', ' ', normtext1.strip())
-    normtext3 = writenumbers.normNumber(normtext2, numtable)
-    outtext.write(normtext3.upper() + "\n")
+    normtext1 = re.sub(r'[\.,:;\?]', '', line)
+    normtext2 = re.sub(r'[\t\\]', ' ', normtext1)
+    normtext3 = re.sub(r' +', ' ', normtext2.strip())
+    normtext4 = writenumbers.normNumber(normtext3, numtable)
+    outtext.write(normtext4)
 
 transcript.close()
 outtext.close()
diff --git a/egs/sprakbanken/s5/local/normalize_transcript_prefixed.py b/egs/sprakbanken/s5/local/normalize_transcript_prefixed.py
index e934533a393..557606ae205 100755
--- a/egs/sprakbanken/s5/local/normalize_transcript_prefixed.py
+++ b/egs/sprakbanken/s5/local/normalize_transcript_prefixed.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 '''
 # Copyright 2013-2014 Mirsk Digital Aps (Author: Andreas Kirkedal)
+# Copyright 2014-2016 Andreas Kirkedal
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -24,16 +25,16 @@
 ## Global vars
 
-normdict = {",": " ",
-            ":": " ",
-            ";": " ",
-            "?": " ",
-            "\\": " ",
-            "\t": " ",
-            #".": ""
-            }
+# normdict = {",": " ",
+#             ":": " ",
+#             ";": " ",
+#             "?": " ",
+#             "\\": " ",
+#             "\t": " ",
+#             #".": ""
+#             }
 
-t_table = str.maketrans(normdict)
+# t_table = str.maketrans(normdict)
 
 ## Utility function
 
@@ -51,12 +52,13 @@ def getuttid_text(line):
 
 for line in textin:
     utt_id, text = getuttid_text(line)
-    normtext1 = text.translate(t_table)
-    normtext2 = re.sub(r' +', ' ', normtext1.strip())
-    normtext3 = writenumbers.normNumber(normtext2, numtable)
-
+    normtext1 = re.sub(r'[\.,:;\?]', '', text)
+    normtext2 = re.sub(r'[\t\\]', ' ', normtext1)
+    normtext3 = re.sub(r' +', ' ', normtext2.strip())
+    normtext4 = writenumbers.normNumber(normtext3, numtable)
+    outtext.write(normtext4)
     fid.write(utt_id + "\n")
-    outtext.write(normtext3)
+
 
 textin.close()
 outtext.close()
diff --git a/egs/sprakbanken/s5/local/score.sh b/egs/sprakbanken/s5/local/score.sh
index abd8149a672..9fcafdc0b5c 100755
--- a/egs/sprakbanken/s5/local/score.sh
+++ b/egs/sprakbanken/s5/local/score.sh
@@ -1,18 +1,24 @@
 #!/bin/bash
-# Copyright 2012 Johns Hopkins University (Author: Daniel Povey)
+# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
 # Apache 2.0
 
+# See the script steps/scoring/score_kaldi_cer.sh in case you need to evaluate CER
+
 [ -f ./path.sh ] && . ./path.sh
 
 # begin configuration section.
 cmd=run.pl
 stage=0
-decode_mbr=true
-word_ins_penalty=0.0
+decode_mbr=false
+stats=true
+beam=6
+word_ins_penalty=0.0,0.5,1.0
 min_lmwt=7
 max_lmwt=17
+iter=final
 #end configuration section.
 
+echo "$0 $@"  # Print the command line for logging
 [ -f ./path.sh ] && . ./path.sh
 . parse_options.sh || exit 1;
 
@@ -37,21 +43,107 @@
 for f in $symtab $dir/lat.1.gz $data/text; do
   [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
 done
 
-mkdir -p $dir/scoring/log
-cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt
+ref_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
+hyp_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"
+
+
+if $decode_mbr ; then
+  echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
+else
+  echo "$0: scoring with word insertion penalty=$word_ins_penalty"
+fi
+
+
+mkdir -p $dir/scoring_kaldi
+cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
+if [ $stage -le 0 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    mkdir -p $dir/scoring_kaldi/penalty_$wip/log
+
+    if $decode_mbr ; then
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-prune --beam=$beam ark:- ark:- \| \
+        lattice-mbr-decode --word-symbol-table=$symtab \
+        ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+
+    else
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+    fi
+
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
+      cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
+      compute-wer --text --mode=present \
+      ark:$dir/scoring_kaldi/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
+
+  done
+fi
 
-$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.log \
-  lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
-  lattice-add-penalty --word-ins-penalty=$word_ins_penalty ark:- ark:- \| \
-  lattice-best-path --word-symbol-table=$symtab \
-  ark:- ark,t:$dir/scoring/LMWT.tra || exit 1;
-# Note: the double level of quoting for the sed command
-$cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.log \
-  cat $dir/scoring/LMWT.tra \| \
-  utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \
-  compute-wer --text --mode=present \
-  ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT || exit 1;
+
+if [ $stage -le 1 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    for lmwt in $(seq $min_lmwt $max_lmwt); do
+      # adding /dev/null to the command list below forces grep to output the filename
+      grep WER $dir/wer_${lmwt}_${wip} /dev/null
+    done
+  done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1
+
+  best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer)
+  best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}')
+  best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}')
+
+  if [ -z "$best_lmwt" ]; then
+    echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong."
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+      utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+      ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+      '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all wer_{?,??} files;
+# these originate from the previous version of the scoring files.
+# I keep both rm statements here because removing only one of them could lead
+# to confusion about the capabilities of the script (we don't do CER in this script).
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
 
 exit 0;
diff --git a/egs/sprakbanken/s5/local/sprak_data_prep.sh b/egs/sprakbanken/s5/local/sprak_data_prep.sh
index c7a1d048a4f..1b2406620f2 100755
--- a/egs/sprakbanken/s5/local/sprak_data_prep.sh
+++ b/egs/sprakbanken/s5/local/sprak_data_prep.sh
@@ -2,6 +2,7 @@
 # Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
 # Copyright 2013-2014 Mirsk Digital Aps (Author: Andreas Kirkedal)
+# Copyright 2015-2016 Andreas Kirkedal
 
 # Apache 2.0.
 
@@ -21,12 +22,12 @@ utils=`pwd`/utils
 
 # This recipe currently relies on version 3 because python3 uses utf8 as internal
 # string representation
-if ! which python3 >&/dev/null; then
-  echo "Installing python3 since not on your path."
-  pushd $KALDI_ROOT/tools || exit 1;
-  extras/install_python3.sh || exit 1;
-  popd
-fi
+#if ! which python3 >&/dev/null; then
+#  echo "Installing python3 since not on your path."
+#  pushd $KALDI_ROOT/tools || exit 1;
+#  extras/install_python3.sh || exit 1;
+#  popd
+#fi
 
 if [ ! -d $dir/download ]; then
   mkdir -p $dir/download/0565-1 $dir/download/0565-2
 fi
 
 echo "Downloading and unpacking sprakbanken to $dir/corpus_processed. This will take a while."
 
 if [ ! -f $dir/download/da.16kHz.0565-1.tar.gz ]; then
-  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-1.tar.gz --directory-prefix=$dir/download ) &
+  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-1.tar.gz --directory-prefix=$dir/download )
 fi
 
 if [ ! -f $dir/download/da.16kHz.0565-2.tar.gz ]; then
-  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-2.tar.gz --directory-prefix=$dir/download ) &
+  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0565-2.tar.gz --directory-prefix=$dir/download )
 fi
 
-if [ ! -f $dir/download/da.16kHz.0565-1.tar.gz ]; then
-  ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0611.tar.gz --directory-prefix=$dir/download ) &
+if [ !
-f $dir/download/da.16kHz.0611.tar.gz ]; then + ( wget http://www.nb.no/sbfil/talegjenkjenning/16kHz/da.16kHz.0611.tar.gz --directory-prefix=$dir/download ) fi wait @@ -51,8 +52,8 @@ echo "Corpus files downloaded." if [ ! -d $dir/download/0611 ]; then echo "Unpacking files." - tar -xzf $dir/download/da.16kHz.0565-1.tar.gz -C $dir/download/0565-1 & - tar -xzf $dir/download/da.16kHz.0565-2.tar.gz -C $dir/download/0565-2 & + tar -xzf $dir/download/da.16kHz.0565-1.tar.gz -C $dir/download/0565-1 + tar -xzf $dir/download/da.16kHz.0565-2.tar.gz -C $dir/download/0565-2 tar -xzf $dir/download/da.16kHz.0611.tar.gz -C $dir/download # Note: rename "da 0611 test" to "da_0611_test" for this to work @@ -62,7 +63,7 @@ if [ ! -d $dir/download/0611 ]; then fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe +sph2pipe=$(which sph2pipe) || sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe if [ ! -x $sph2pipe ]; then echo "Could not find (or execute) the sph2pipe program at $sph2pipe"; exit 1; @@ -78,27 +79,25 @@ mkdir -p $dir/corpus_processed/training/0565-1 $dir/corpus_processed/training/05 # Create parallel file lists and text files, but keep sound files in the same location to save disk space # Writes the lists to data/local/data (~ 310h) echo "Creating parallel data for training data." -python3 $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 & # ~130h -python3 $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 & # ~115h -python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 & # ~51h +python $local/sprak2kaldi.py $dir/download/0565-1 $dir/corpus_processed/training/0565-1 # ~130h +python $local/sprak2kaldi.py $dir/download/0565-2 $dir/corpus_processed/training/0565-2 # ~115h +python $local/sprak2kaldi.py $dir/download/0611/Stasjon05 $dir/corpus_processed/training/0611_Stasjon05 # ~51h ( # Ditto dev set (~ 16h) echo "Creating parallel data for test data." rm -rf $dir/corpus_processed/dev03 mkdir -p $dir/corpus_processed/dev03 - python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 & -) & + python $local/sprak2kaldi.py $dir/download/0611/Stasjon03 $dir/corpus_processed/dev03 || exit 1; +) ( # Ditto test set (about 9 hours) echo "Creating parallel data for development data." rm -rf $dir/corpus_processed/test06 mkdir -p $dir/corpus_processed/test06 - python3 $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1; -) & - -wait + python $local/sprak2kaldi.py $dir/download/0611/Stasjon06 $dir/corpus_processed/test06 || exit 1; +) # Create the LM training data # Test and dev data is disjoint from training data, so we use those transcripts) @@ -110,10 +109,10 @@ wait ( echo "Writing the LM text to file and normalising." cat $dir/corpus_processed/training/0565-1/txtlist $dir/corpus_processed/training/0565-2/txtlist | while read l; do cat $l; done > $lmdir/lmsents - python3 local/normalize_transcript.py local/norm_dk/numbersUp.tbl $lmdir/lmsents $lmdir/lmsents.norm + python local/normalize_transcript.py local/norm_dk/numbersLow.tbl $lmdir/lmsents $lmdir/lmsents.norm local/norm_dk/format_text.sh lm $lmdir/lmsents.norm > $lmdir/transcripts.txt sort -u $lmdir/transcripts.txt > $lmdir/transcripts.uniq -) & +) # Combine training file lists echo "Combine file lists." 
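
Reviewer note: the hunk above now serialises the LM text preparation and switches it to the
lowercase table. A minimal smoke test of that normalisation path, using only the call
signatures that appear in this patch; the sample sentence and the /tmp paths are invented
for illustration:

    # hypothetical one-line input; "21." should be expanded via numbersLow.tbl
    echo 'Mødet er den 21. i rækken' > /tmp/lmsents
    # same invocation as in the hunk above: <table> <input> <output>
    python local/normalize_transcript.py local/norm_dk/numbersLow.tbl /tmp/lmsents /tmp/lmsents.norm
    # 'lm' mode does sentence splitting; the output should contain "en og tyvende"
    local/norm_dk/format_text.sh lm /tmp/lmsents.norm
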
@@ -131,18 +130,15 @@ cp $dir/corpus_processed/test06/sndlist $dir/testsndfiles # Write wav.scp, utt2spk and text.unnormalised for train, test and dev sets with # Use sph2pipe because the wav files are actually sph files echo "Creating wav.scp, utt2spk and text.unnormalised for train, test and dev" -python3 $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe & -python3 $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe & -python3 $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe & +python $local/data_prep.py $dir/traintxtfiles $traindir $dir/trainsndfiles $sph2pipe +python $local/data_prep.py $dir/testtxtfiles $testdir $dir/testsndfiles $sph2pipe +python $local/data_prep.py $dir/devtxtfiles $devdir $dir/devsndfiles $sph2pipe -wait # Create the main data sets -local/create_datasets.sh $testdir data/test & -local/create_datasets.sh $devdir data/dev & -local/create_datasets.sh $traindir data/train & - -wait +local/create_datasets.sh $testdir data/test +local/create_datasets.sh $devdir data/dev +local/create_datasets.sh $traindir data/train ## TODO diff --git a/egs/sprakbanken/s5/local/wer_hyp_filter b/egs/sprakbanken/s5/local/wer_hyp_filter new file mode 100755 index 00000000000..8ecbdd3ec04 --- /dev/null +++ b/egs/sprakbanken/s5/local/wer_hyp_filter @@ -0,0 +1,5 @@ +#!/bin/bash + +perl -C -pe 's:::g; s:::g; s:::g' | \ +perl -pe 's/é|è|ë/e/g; s/á|à|ä/a/g; s/ó|ò|ö/o/g; s/ú|ù|ü/u/g; s/É|È|Ë/E/g; s/Ó|Ò|Ö/O/g;' | \ +PERLIO=:utf8 perl -pe '$_=lc' diff --git a/egs/sprakbanken/s5/local/wer_output_filter b/egs/sprakbanken/s5/local/wer_output_filter new file mode 100755 index 00000000000..8ecbdd3ec04 --- /dev/null +++ b/egs/sprakbanken/s5/local/wer_output_filter @@ -0,0 +1,5 @@ +#!/bin/bash + +perl -C -pe 's:::g; s:::g; s:::g' | \ +perl -pe 's/é|è|ë/e/g; s/á|à|ä/a/g; s/ó|ò|ö/o/g; s/ú|ù|ü/u/g; s/É|È|Ë/E/g; s/Ó|Ò|Ö/O/g;' | \ +PERLIO=:utf8 perl -pe '$_=lc' diff --git a/egs/sprakbanken/s5/local/wer_ref_filter b/egs/sprakbanken/s5/local/wer_ref_filter new file mode 100755 index 00000000000..8ecbdd3ec04 --- /dev/null +++ b/egs/sprakbanken/s5/local/wer_ref_filter @@ -0,0 +1,5 @@ +#!/bin/bash + +perl -C -pe 's:::g; s:::g; s:::g' | \ +perl -pe 's/é|è|ë/e/g; s/á|à|ä/a/g; s/ó|ò|ö/o/g; s/ú|ù|ü/u/g; s/É|È|Ë/E/g; s/Ó|Ò|Ö/O/g;' | \ +PERLIO=:utf8 perl -pe '$_=lc' diff --git a/egs/sprakbanken/s5/local/writenumbers.py b/egs/sprakbanken/s5/local/writenumbers.py index 452cd3e7e9c..df3235243d4 100755 --- a/egs/sprakbanken/s5/local/writenumbers.py +++ b/egs/sprakbanken/s5/local/writenumbers.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- ''' # Copyright 2014 Author: Andreas Kirkedal diff --git a/egs/sprakbanken/s5/run.sh b/egs/sprakbanken/s5/run.sh index 34c1f18d964..53fd7b1484e 100755 --- a/egs/sprakbanken/s5/run.sh +++ b/egs/sprakbanken/s5/run.sh @@ -4,198 +4,139 @@ ## This relates to the queue. . ./path.sh # so python3 is on the path if not on the system (we made a link to utils/).a -# This is a shell script, but it's recommended that you run the commands one by -# one by copying and pasting into the shell. - - -# Download the corpus and prepare parallel lists of sound files and text files -# Divide the corpus into train, dev and test sets -local/sprak_data_prep.sh || exit 1; - -# Perform text normalisation, prepare dict folder and LM data transcriptions -# This setup uses previsously prepared data. 
eSpeak must be installed and in PATH to use dict_prep.sh
-#local/dict_prep.sh || exit 1;
-local/copy_dict.sh || exit 1;
-
-
-utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang_tmp data/lang || exit 1;
-
-# Now make MFCC features.
-# mfccdir should be some place with a largish disk where you
-# want to store MFCC features.
-mfccdir=mfcc
-
-
-# Extract mfccs
-# p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
-# wave files are corrupt
-# Will return a warning message because of the corrupt audio files, but compute them anyway
-# If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
-
-steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/test exp/make_mfcc/test mfcc &
-steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/dev exp/make_mfcc/dev mfcc &
-steps/make_mfcc.sh --nj 10 --cmd $train_cmd data/train exp/make_mfcc/train mfcc || exit 1;
-wait
-
-# Compute cepstral mean and variance normalisation
-steps/compute_cmvn_stats.sh data/test exp/make_mfcc/test mfcc &
-steps/compute_cmvn_stats.sh data/dev exp/make_mfcc/dev mfcc &
-steps/compute_cmvn_stats.sh data/train exp/make_mfcc/train mfcc
-
-wait
-
-# Repair data set (remove corrupt data points with corrupt audio)
-
-utils/fix_data_dir.sh data/test &
-utils/fix_data_dir.sh data/dev &
-utils/fix_data_dir.sh data/train
-wait
-
-# Train LM with CMUCLMTK
-# This setup uses IRSTLM
-#local/sprak_train_lm.sh &> data/local/cmuclmtk/lm.log
-
-# Train LM with irstlm
-local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "3g" data/lang data/local/train3_lm &> data/local/3g.log &
-local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "4g" data/lang data/local/train4_lm &> data/local/4g.log
-
-# Make subset with 1k utterances for rapid testing
-# Randomly selects 980 utterances from 7 speakers
-utils/subset_data_dir.sh --per-spk data/test 140 data/test1k &
-
-# Now make subset of the training data with the shortest 120k utterances.
-utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1;
-
-# Train monophone model on short utterances
-steps/train_mono.sh --nj 30 --cmd "$train_cmd" \
-  data/train_120kshort data/lang exp/mono0a || exit 1;
-
-# Ensure that LMs are created
-wait
-
-utils/mkgraph.sh data/lang_test_3g exp/mono0a exp/mono0a/graph_3g &
-utils/mkgraph.sh data/lang_test_4g exp/mono0a exp/mono0a/graph_4g &
-
-# Ensure that all graphs are constructed
-wait
-
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/mono0a/graph_3g data/test1k exp/mono0a/decode_3g_test1k
-
-# steps/align_si.sh --boost-silence 1.25 --nj 42 --cmd "$train_cmd" \
-steps/align_si.sh --nj 30 --cmd "$train_cmd" \
-  data/train data/lang exp/mono0a exp/mono0a_ali || exit 1;
-
-# steps/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \
-steps/train_deltas.sh --cmd "$train_cmd" \
-  2000 10000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1;
-
-wait
-
-
-utils/mkgraph.sh data/lang_test_3g exp/tri1 exp/tri1/graph_3g &
-utils/mkgraph.sh data/lang_test_4g exp/tri1 exp/tri1/graph_4g || exit 1;
-
-(
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri1/graph_4g data/test1k exp/tri1/decode_4g_test1k || exit 1;
-) &
-
-(
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri1/graph_3g data/test1k exp/tri1/decode_3g_test1k || exit 1;
-) &
-
-wait
-
-steps/align_si.sh --nj 30 --cmd "$train_cmd" \
-  data/train data/lang exp/tri1 exp/tri1_ali || exit 1;
-
-
-# Train tri2a, which is deltas + delta-deltas.
-steps/train_deltas.sh --cmd "$train_cmd" \
-  2500 15000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1;
-
-utils/mkgraph.sh data/lang_test_3g exp/tri2a exp/tri2a/graph_3g || exit 1;
-
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri2a/graph_3g data/test1k exp/tri2a/decode_3g_test1k || exit 1;
-
-
-steps/train_lda_mllt.sh --cmd "$train_cmd" \
-  --splice-opts "--left-context=5 --right-context=5" \
-  2500 15000 data/train data/lang exp/tri1_ali exp/tri2b || exit 1;
-
-utils/mkgraph.sh data/lang_test_3g exp/tri2b exp/tri2b/graph_3g || exit 1;
-steps/decode.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri2b/graph_3g data/test1k exp/tri2b/decode_3g_test1k || exit 1;
-
-
-steps/align_si.sh --nj 30 --cmd "$train_cmd" \
-  --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
-
-wait
-
-
-# From 2b system, train 3b which is LDA + MLLT + SAT.
-steps/train_sat.sh --cmd "$train_cmd" \
-  2500 15000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
-utils/mkgraph.sh data/lang_test_3g exp/tri3b exp/tri3b/graph_3g || exit 1;
-steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri3b/graph_3g data/test1k exp/tri3b/decode_3g_test1k || exit 1;
-
-
-# Trying 4-gram language model
-utils/mkgraph.sh data/lang_test_4g exp/tri3b exp/tri3b/graph_4g || exit 1;
-
-steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 \
-  exp/tri3b/graph_4g data/test1k exp/tri3b/decode_4g_test1k || exit 1;
-
-# This is commented out for now as it's not important for the main recipe.
-## Train RNN for reranking
-#local/sprak_train_rnnlms.sh data/local/dict data/dev/transcripts.uniq data/local/rnnlms/g_c380_d1k_h100_v130k
-## Consumes a lot of memory! Do not run in parallel
-#local/sprak_run_rnnlms_tri3b.sh data/lang_test_3g data/local/rnnlms/g_c380_d1k_h100_v130k data/test1k exp/tri3b/decode_3g_test1k
-
-
-# From 3b system
-steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
+nj=12
+
+stage=0
+. utils/parse_options.sh
+
+if [ $stage -le 0 ]; then
+  # Download the corpus and prepare parallel lists of sound files and text files
+  # Divide the corpus into train, dev and test sets
+  local/sprak_data_prep.sh || exit 1;
+fi
+
+if [ $stage -le 1 ]; then
+  # Perform text normalisation, prepare dict folder and LM data transcriptions
+  # This setup uses previously prepared data. eSpeak must be installed and in PATH to use dict_prep.sh
+  # local/dict_prep.sh || exit 1;
+  local/copy_dict.sh || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  utils/prepare_lang.sh data/local/dict "<UNK>" data/local/lang_tmp data/lang || exit 1;
+fi
+
+if [ $stage -le 3 ]; then
+  # Extract mfccs
+  # p was added to the rspecifier (scp,p:$logdir/wav.JOB.scp) in make_mfcc.sh because some
+  # wave files are corrupt
+  # Will return a warning message because of the corrupt audio files, but compute them anyway
+  # If this step fails and prints a partial diff, rerun from sprak_data_prep.sh
+  for dataset in train test dev; do
+    steps/make_mfcc.sh --nj $nj --cmd "$train_cmd" data/$dataset || exit 1;
+
+    # Compute cepstral mean and variance normalisation
+    steps/compute_cmvn_stats.sh data/$dataset || exit 1;
+
+    # Repair data set (remove corrupt data points with corrupt audio)
+    utils/fix_data_dir.sh data/$dataset || exit 1;
+
+  done
+  # Make a subset of the training data with the shortest 120k utterances.
+ utils/subset_data_dir.sh --shortest data/train 120000 data/train_120kshort || exit 1; +fi + +if [ $stage -le 4 ]; then + # Train LM with irstlm + local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 3 "tg" data/lang data/local/train3_lm &> data/local/tg.log || exit 1; + local/train_irstlm.sh data/local/transcript_lm/transcripts.uniq 4 "fg" data/lang data/local/train4_lm &> data/local/fg.log || exit 1; +fi + +if [ $stage -le 5 ]; then + # Train monophone model on short utterances + steps/train_mono.sh --nj $nj --cmd "$train_cmd" \ + data/train_120kshort data/lang exp/mono0a || exit 1; + utils/mkgraph.sh --mono data/lang_test_tg exp/mono0a exp/mono0a/graph_tg || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/mono0a/graph_tg data/dev exp/mono0a/decode_tg_dev || exit 1; +fi + +if [ $stage -le 6 ]; then + # Train tri1 (delta+delta-delta) + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/mono0a exp/mono0a_ali || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 3000 40000 data/train data/lang exp/mono0a_ali exp/tri1 || exit 1; + + # Decode dev set with both LMs + utils/mkgraph.sh data/lang_test_tg exp/tri1 exp/tri1/graph_tg || exit 1; + utils/mkgraph.sh data/lang_test_fg exp/tri1 exp/tri1/graph_fg || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/tri1/graph_fg data/dev exp/tri1/decode_fg_dev || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/tri1/graph_tg data/dev exp/tri1/decode_tg_dev || exit 1; +fi + +if [ $stage -le 7 ]; then + # Train tri2a (delta + delta-delta) + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri1 exp/tri1_ali || exit 1; + steps/train_deltas.sh --cmd "$train_cmd" \ + 5000 60000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; + utils/mkgraph.sh data/lang_test_tg exp/tri2a exp/tri2a/graph_tg || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/tri2a/graph_tg data/dev exp/tri2a/decode_tg_dev || exit 1; +fi + +if [ $stage -le 8 ]; then + # Train tri2b (LDA+MLLT) + steps/align_si.sh --nj $nj --cmd "$train_cmd" \ + data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; + steps/train_lda_mllt.sh --cmd "$train_cmd" \ + --splice-opts "--left-context=5 --right-context=5" \ + 6500 75000 data/train data/lang exp/tri2a_ali exp/tri2b || exit 1; + utils/mkgraph.sh data/lang_test_tg exp/tri2b exp/tri2b/graph_tg || exit 1; + steps/decode.sh --nj 12 --cmd "$decode_cmd" \ + exp/tri2b/graph_tg data/dev exp/tri2b/decode_tg_dev || exit 1; +fi + +if [ $stage -le 9 ]; then + # From 2b system, train 3b which is LDA + MLLT + SAT. 
+  steps/align_si.sh --nj $nj --cmd "$train_cmd" \
+    --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1;
+  steps/train_sat.sh --cmd "$train_cmd" \
+    7500 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1;
+
+  # Decode dev with 4gram and 3gram LMs
+  utils/mkgraph.sh data/lang_test_tg exp/tri3b exp/tri3b/graph_tg || exit 1;
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 12 \
+    exp/tri3b/graph_tg data/dev exp/tri3b/decode_tg_dev || exit 1;
+  utils/mkgraph.sh data/lang_test_fg exp/tri3b exp/tri3b/graph_fg || exit 1;
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 12 \
+    exp/tri3b/graph_fg data/dev exp/tri3b/decode_fg_dev || exit 1;
+
+  # Decode test with 4gram and 3gram LMs
+  # there are fewer speakers (n=7) and decoding usually ends up waiting
+  # for a single job so we use --num-threads 2 to speed up
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 --num-threads 2 \
+    exp/tri3b/graph_tg data/test exp/tri3b/decode_tg_test || exit 1;
+  steps/decode_fmllr.sh --cmd "$decode_cmd" --nj 7 --num-threads 2 \
+    exp/tri3b/graph_fg data/test exp/tri3b/decode_fg_test || exit 1;
+fi
+
+if [ $stage -le 10 ]; then
+# Alignment used to train nnets and sgmms
+steps/align_fmllr.sh --nj $nj --cmd "$train_cmd" \
   data/train data/lang exp/tri3b exp/tri3b_ali || exit 1;
+fi
 
-# From 3b system, train another SAT system (tri4a) with all the si284 data.
-
-steps/train_sat.sh --cmd "$train_cmd" \
-  4200 40000 data/train data/lang exp/tri3b_ali exp/tri4a || exit 1;
-
-utils/mkgraph.sh data/lang_test_3g exp/tri4a exp/tri4a/graph_3g || exit 1;
-steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-  exp/tri4a/graph_3g data/test1k exp/tri4a/decode_3g_test1k || exit 1;
-
-
-steps/train_quick.sh --cmd "$train_cmd" \
-  4200 40000 data/train data/lang exp/tri3b_ali exp/tri4b || exit 1;
-
-(
-  utils/mkgraph.sh data/lang_test_3g exp/tri4b exp/tri4b/graph_3g || exit 1;
-  steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-    exp/tri4b/graph_3g data/test1k exp/tri4b/decode_3g_test1k || exit 1;
-) &
-
-  utils/mkgraph.sh data/lang_test_4g exp/tri4b exp/tri4b/graph_4g || exit 1;
-  steps/decode_fmllr.sh --nj 7 --cmd "$decode_cmd" \
-    exp/tri4b/graph_4g data/test1k exp/tri4b/decode_4g_test1k || exit 1;
-
-wait
-
-# alignment used to train nnets and sgmms
-steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \
-  data/train data/lang exp/tri4b exp/tri4b_ali || exit 1;
+##TODO: Add nnet3 and chain setups
 
 ## Works
-local/sprak_run_nnet_cpu.sh 3g test1k
+#local/sprak_run_nnet_cpu.sh tg dev
 
 ## Works
-local/sprak_run_sgmm2.sh test1k
+#local/sprak_run_sgmm2.sh dev
 
 # Getting results [see RESULTS file]
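
Reviewer note: because nj and stage are defined before sourcing utils/parse_options.sh,
they are now exposed as command-line flags, so partial reruns no longer require editing
run.sh. A usage sketch (the stage numbers refer to the blocks above):

    # full run with the defaults (nj=12, stage=0)
    ./run.sh
    # resume from MFCC extraction (stage 3) with 8 parallel jobs
    ./run.sh --stage 3 --nj 8
    # redo only the LDA+MLLT+SAT system and everything after it (stage 9 onwards)
    ./run.sh --stage 9
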