Merged
94 commits
22ea467
added basic file structure
Jul 21, 2017
c011a3f
initial commit for preparation of IAM dataset
aarora8 Jul 21, 2017
296d112
added scripts to extract features from image
Jul 21, 2017
f6d6727
Merge remote-tracking branch 'aarora8/IAM_eng' into iam_ocr
Jul 21, 2017
b5fbd16
removed egs/iam directory and moved necessary files to egs/iam_en to …
Jul 21, 2017
f4ca585
adding code for writing to text, wav.scp and utt2spk file
aarora8 Jul 21, 2017
82c8596
Merge remote-tracking branch 'aarora8/IAM_eng' into iam_ocr
Jul 24, 2017
cdbbefb
added code to perform splits
Jul 24, 2017
b787cfe
adding code for creating train, validation and test sets and also crea…
aarora8 Jul 25, 2017
910d710
changed so train, test, valid examples are separated
Jul 25, 2017
8ef99d6
merged aarora8/IAM_eng and fixed some minor things
Jul 25, 2017
696b21c
accidentally split spk2utt incorrectly
Jul 25, 2017
dd208c3
adding changes for setting desired variance floor value
aarora8 Jul 25, 2017
3911054
cosmetic fix
aarora8 Jul 26, 2017
4938413
reading lines.txt and creating a text dictionary
aarora8 Jul 26, 2017
170f627
using ascii/lines files instead of xml
aarora8 Jul 26, 2017
aaf9dce
WIP added scripts to prepare dict and lexicon
Jul 27, 2017
2e1b600
fixed bug where due to rounding errors the images won't be resized ex…
Jul 27, 2017
3bcc639
fixing bug in initializing variance floor vector
aarora8 Jul 27, 2017
2159a85
fixing bug in initializing variance floor vector creating vector in m…
aarora8 Jul 28, 2017
99173f8
added bugfixes that aarora8/IAM_eng made
Jul 28, 2017
1549c8b
adding code for preparing character based language model
aarora8 Jul 28, 2017
f455eff
adding files for character based language model
aarora8 Jul 28, 2017
2f7f7d9
added decoding to run.sh and the necessary files for decoding into local
Jul 31, 2017
00f299c
adding decoding changes from ChunChiehChang and adding changes for tr…
aarora8 Aug 1, 2017
f748891
minor changes so that testing scripts will be easier
Aug 1, 2017
19e2e79
added scripts to use characters instead of words for models. Also ren…
Aug 10, 2017
424bb2e
added in comments the ability to use individual words in training/tes…
Aug 10, 2017
32b7030
changes to creating lexicon
Aug 11, 2017
b591fd3
added scripts to ensure lm decodes one word without uniform distribut…
Aug 15, 2017
ca1aa1a
modified script to run on individual words instead of lines of text
Aug 15, 2017
5b54ad3
enabled silence models to have two states
Aug 15, 2017
c519d25
code for handwriting word recognition for database IAM
aarora8 Sep 9, 2017
54c5e1b
fixing reset
aarora8 Sep 11, 2017
e0aed5d
updating configuration, [wip] adding code for using brown and lob corpus
aarora8 Sep 13, 2017
de00d4e
minor fix in train_lm code
aarora8 Sep 13, 2017
05153cc
adding modification for getting vocab from corpus, removing comments …
aarora8 Sep 21, 2017
b45598c
adding modifications for open vocab task
aarora8 Oct 2, 2017
83ba89f
cosmetic fix
aarora8 Oct 2, 2017
cdd45a4
adding modifications for upper case to lower case and cosmetic fix
aarora8 Oct 3, 2017
4ba261b
removing exit from run.sh
aarora8 Oct 3, 2017
8593c98
adding modifications for including validation data in training
aarora8 Oct 7, 2017
55ef746
bug fix thanks Yiwen Shao
aarora8 Oct 7, 2017
a120cde
removed variance floor as it did not seem to help
Oct 7, 2017
cb4a5d3
bug fix
aarora8 Oct 7, 2017
4b0a999
removed unnecessary run file
Oct 7, 2017
acc7dc8
modified score to use steps/scoring/scoring_kaldi_wer.sh
Oct 7, 2017
7127676
updated scripts to most recent version. Finished the run.sh
Oct 7, 2017
67181ea
Merge remote-tracking branch 'aarora8/iam' into iam_ocr
Oct 7, 2017
5be76ec
bug fix
aarora8 Oct 7, 2017
5ab4289
removing .txt
aarora8 Oct 7, 2017
40a5bd8
Merge remote-tracking branch 'aarora8/iam' into iam_ocr
Oct 7, 2017
86f69f8
Merge remote-tracking branch 'upstream/master' into iam_ocr
Oct 30, 2017
2a59b3a
OCR: Add IAM corpus with unk decoding support (#3)
aarora8 Oct 30, 2017
13e0a5b
Add a new English OCR database 'UW3'
ChunChiehChang Nov 2, 2017
fdd0953
Some minor fixes re IAM corpus
hhadian Nov 15, 2017
aa7c19a
Fix an issue in IAM chain recipes + add a new recipe (#6)
aarora8 Nov 20, 2017
856f0eb
removed part to prepare data for validation sets and added CER to score
Nov 29, 2017
025ce1d
merged branch from hhadian/ocr and resolved some conflicts
Nov 29, 2017
4e085a4
Some fixes based on the pull request review
aarora8 Dec 22, 2017
e243bee
Various fixes + cleaning on IAM
hhadian Dec 22, 2017
0e4f613
Fix LM estimation and add extended dictionary + other minor fixes
hhadian Dec 24, 2017
6f790ed
Add README for IAM
hhadian Dec 24, 2017
96b51d4
Add output filter for scoring
hhadian Dec 24, 2017
b914da2
Fix a bug RE switch to python3
hhadian Dec 24, 2017
05fb12e
Add updated results + minor fixes
hhadian Dec 24, 2017
1e3a8c4
Remove unk decoding -- gives almost no gain
hhadian Dec 24, 2017
a08725e
Add UW3 OCR database
ChunChiehChang Dec 31, 2017
af85099
merged from hhadian ocr
Jan 1, 2018
e34fc8e
forgot to remove last of variance floor option
Jan 1, 2018
16a9104
different number of states for punctuations
Jan 10, 2018
44d3ce2
removed commented out code
Jan 12, 2018
d873784
adding updated results
Jan 12, 2018
e52df3c
removed unnecessary folder
Jan 12, 2018
bbc7b4c
removed some changes from variance floor option that is now removed
Jan 12, 2018
018600c
moved s5 to v1
Jan 19, 2018
ab5a51c
merge master branch to get hhadian commits
Jan 19, 2018
7e1a8a2
moved s5 to wrong location
Jan 19, 2018
f82aaa4
removed unused files
Jan 19, 2018
26bf5b4
changed run to use local prepare_lang.sh
Jan 19, 2018
bb6a073
Merge remote-tracking branch 'origin/master' into iam_ocr
Feb 26, 2018
340de0c
added unk and added wellington corpus. removed LOB because IAM and LO…
Mar 2, 2018
048b2e5
forgot to add some stuff for unk and also for different topo for punc…
Mar 2, 2018
52c6721
Add initial scripts for e2e ocr - not cleaned
hhadian Mar 19, 2018
92a5866
Add e2e chain script
hhadian Mar 19, 2018
ea839ad
Some fixes
hhadian Mar 19, 2018
f5cbb24
Some cleaning
hhadian Mar 19, 2018
aa6f698
removed the test words from LOB corpus. Previous commits just removed…
Mar 19, 2018
d7aa22b
Merge remote-tracking branch 'hhadian/e2e_ocr' into iam_ocr
Mar 20, 2018
c749a5b
merged upstream master
Apr 12, 2018
639d76b
adding wellington corpus and fixing some merge issues
Apr 12, 2018
e48c0ef
forgot to fix all merge conflicts
Apr 12, 2018
91ebb25
removing some unneeded files
Apr 12, 2018
750e11c
added decode_gmm option. Removed unnecessary file
Apr 13, 2018
34 changes: 34 additions & 0 deletions egs/iam/v1/local/prepare_data.sh
@@ -43,13 +43,15 @@ xml=data/local/xml
ascii=data/local/ascii
bcorpus=data/local/browncorpus
lobcorpus=data/local/lobcorpus
wcorpus=data/local/wellingtoncorpus
data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask
lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz
xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz
data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip
ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz
brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt
lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip
wellington_corpus_loc=/export/corpora5/Wellington/WWC/
mkdir -p $download_dir data/local

# download and extract images and transcriptions
@@ -124,6 +126,38 @@ else
echo "$0: Done downloading the Brown text corpus"
fi

if [ -d $wcorpus ]; then
echo "$0: Not copying Wellington corpus as it is already there."
else
mkdir -p $wcorpus
cp -r $wellington_corpus_loc/. $wcorpus

# Combine Wellington corpora and replace some of their annotations
cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \
cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt

cat data/local/wellingtoncorpus/Wellington_annotated.txt | python3 <(
cat << EOF
import sys, io, re;
from collections import OrderedDict;
sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8");
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8");
dict=OrderedDict([("^",""), ("|",""), ("_",""), ("*0",""), ("*1",""), ("*2",""), ("*3",""), ("*4",""),
("*5",""), ("*6",""), ("*7",""), ("*8",""), ("*9",""), ("*@","°"), ("**=",""), ("*=",""),
("*+$",""), ("$",""), ("*+","£"), ("*-","-"), ("*/","*"), ("*|",""), ("*{","{"), ("*}","}"),
("**#",""), ("*#",""), ("*?",""), ("**\"","\""), ("*\"","\""), ("**'","'"), ("*'","'"),
("*<",""), ("*>",""), ("**[",""), ("**]",""), ("**;",""), ("*;",""), ("**:",""), ("*:",""),
("\\\0",""), ("\\\15",""), ("\\\1",""), ("\\\2",""), ("\\\3",""), ("\\\6",""), ("\\\",""),
("{0",""), ("{15",""), ("{1",""), ("{2",""), ("{3",""), ("{6","")]);
pattern = re.compile("|".join(re.escape(key) for key in dict.keys()) + "|[^\\*]\\}");
dict["}"]="";
[sys.stdout.write(pattern.sub(lambda x: dict[x.group()[1:]] if re.match('[^\\*]\\}', x.group()) else dict[x.group()], line)) for line in sys.stdin];
EOF
) > data/local/wellingtoncorpus/Wellington_annotation_removed.txt

echo "$0: Done copying Wellington corpus"
fi

mkdir -p data/{train,test,val}
file_name=largeWriterIndependentTextLineRecognitionTask

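The block added above strips Wellington markup by building a single regular expression that alternates over every annotation token (longer tokens such as "**=" listed before their prefixes such as "*=") and replacing each match through a lookup table. Below is a minimal standalone sketch of the same idea; the token subset is a small illustrative selection, not the full Wellington mapping.

#!/usr/bin/env python3
# Sketch of the annotation-stripping approach used in prepare_data.sh:
# one alternation regex over all markup tokens, longest first, with each
# match replaced via a lookup table. The token subset is illustrative only.
import re
import sys
from collections import OrderedDict

# Order matters: longer tokens must precede their prefixes ("**=" before "*=").
subs = OrderedDict([
    ("**=", ""), ("*=", ""),
    ("*+", "£"), ("*-", "-"), ("*@", "°"),
    ("^", ""), ("|", ""), ("_", ""),
])
pattern = re.compile("|".join(re.escape(token) for token in subs))

def strip_annotations(line):
    # Replace every matched markup token with its mapped value.
    return pattern.sub(lambda m: subs[m.group()], line)

if __name__ == "__main__":
    for line in sys.stdin:
        sys.stdout.write(strip_annotations(line))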
3 changes: 2 additions & 1 deletion egs/iam/v1/local/train_lm.sh
@@ -62,6 +62,7 @@ if [ $stage -le 0 ]; then
local/remove_test_utterances_from_lob.py data/test/text data/val/text \
> ${dir}/data/text/lob.txt
cat data/local/browncorpus/brown.txt >> ${dir}/data/text/brown.txt
cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt >> ${dir}/data/text/wellington.txt

# use the validation data as the dev set.
# Note: the name 'dev' is treated specially by pocolm, it automatically
@@ -81,7 +82,7 @@ if [ $stage -le 0 ]; then
cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt

# get the wordlist from IAM text
cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
fi

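The word_count/wordlist step above counts whitespace-separated tokens across all the LM text sources and keeps the vocab_size most frequent words. A rough Python equivalent is sketched below; the file names and vocabulary size are illustrative assumptions, not values from the recipe.

#!/usr/bin/env python3
# Rough equivalent of the word_count/wordlist pipeline in train_lm.sh:
# count whitespace-separated tokens over the LM sources and keep the top
# vocab_size words. File names and vocab_size are illustrative assumptions.
from collections import Counter

sources = ["iam.txt", "lob.txt", "brown.txt", "wellington.txt"]
vocab_size = 50000

counts = Counter()
for path in sources:
    with open(path, encoding="utf-8") as f:
        for line in f:
            counts.update(line.split())

with open("wordlist", "w", encoding="utf-8") as out:
    for word, _ in counts.most_common(vocab_size):
        out.write(word + "\n")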
11 changes: 6 additions & 5 deletions egs/iam/v1/run.sh
@@ -7,6 +7,7 @@
set -e
stage=0
nj=20
decode_gmm=false
username=
password=
# iam_database points to the database path on the JHU grid. If you have not
@@ -78,7 +79,7 @@ if [ $stage -le 4 ]; then
data/lang exp/mono
fi

if [ $stage -le 5 ]; then
if [ $stage -le 5 ] && $decode_gmm; then
utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph

steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \
@@ -93,7 +94,7 @@ if [ $stage -le 6 ]; then
exp/mono_ali exp/tri
fi

if [ $stage -le 7 ]; then
if [ $stage -le 7 ] && $decode_gmm; then
utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph

steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \
@@ -109,7 +110,7 @@ if [ $stage -le 8 ]; then
data/train data/lang exp/tri_ali exp/tri2
fi

if [ $stage -le 9 ]; then
if [ $stage -le 9 ] && $decode_gmm; then
utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph

steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \
@@ -124,7 +125,7 @@ if [ $stage -le 10 ]; then
data/train data/lang exp/tri2_ali exp/tri3
fi

if [ $stage -le 11 ]; then
if [ $stage -le 11 ] && $decode_gmm; then
utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph

steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \
@@ -137,7 +138,7 @@ if [ $stage -le 12 ]; then
fi

if [ $stage -le 13 ]; then
local/chain/run_cnn_1a.sh
local/chain/run_cnn_1a.sh --lang-test lang_unk
fi

if [ $stage -le 14 ]; then