diff --git a/egs/iam/v1/local/prepare_data.sh b/egs/iam/v1/local/prepare_data.sh
index e751d5ff71a..9f969998340 100755
--- a/egs/iam/v1/local/prepare_data.sh
+++ b/egs/iam/v1/local/prepare_data.sh
@@ -43,6 +43,7 @@ xml=data/local/xml
 ascii=data/local/ascii
 bcorpus=data/local/browncorpus
 lobcorpus=data/local/lobcorpus
+wcorpus=data/local/wellingtoncorpus
 data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask
 lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz
 xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz
@@ -50,6 +51,7 @@ data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndep
 ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz
 brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt
 lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip
+wellington_corpus_loc=/export/corpora5/Wellington/WWC/
 
 mkdir -p $download_dir data/local
 # download and extact images and transcription
@@ -124,6 +126,38 @@ else
   echo "$0: Done downloading the Brown text corpus"
 fi
 
+if [ -d $wcorpus ]; then
+  echo "$0: Not copying Wellington corpus as it is already there."
+else
+  mkdir -p $wcorpus
+  cp -r $wellington_corpus_loc/. $wcorpus
+
+  # Combine Wellington corpora and replace some of their annotations
+  cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \
+    cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt
+
+  cat data/local/wellingtoncorpus/Wellington_annotated.txt | python3 <(
+    cat << EOF
+import sys, io, re;
+from collections import OrderedDict;
+sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8");
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8");
+dict=OrderedDict([("^",""), ("|",""), ("_",""), ("*0",""), ("*1",""), ("*2",""), ("*3",""), ("*4",""),
+                  ("*5",""), ("*6",""), ("*7",""), ("*8",""), ("*9",""), ("*@","°"), ("**=",""), ("*=",""),
+                  ("*+$",""), ("$",""), ("*+","£"), ("*-","-"), ("*/","*"), ("*|",""), ("*{","{"), ("*}","}"),
+                  ("**#",""), ("*#",""), ("*?",""), ("**\"","\""), ("*\"","\""), ("**'","'"), ("*'","'"),
+                  ("*<",""), ("*>",""), ("**[",""), ("**]",""), ("**;",""), ("*;",""), ("**:",""), ("*:",""),
+                  ("\\\0",""), ("\\\15",""), ("\\\1",""), ("\\\2",""), ("\\\3",""), ("\\\6",""), ("\\\",""),
+                  ("{0",""), ("{15",""), ("{1",""), ("{2",""), ("{3",""), ("{6","")]);
+pattern = re.compile("|".join(re.escape(key) for key in dict.keys()) + "|[^\\*]\\}");
+dict["}"]="";
+# For a two-char match like "a}", keep the leading character and strip only "}".
+[sys.stdout.write(pattern.sub(lambda x: x.group()[0] + dict[x.group()[1:]] if re.match('[^\\*]\\}', x.group()) else dict[x.group()], line)) for line in sys.stdin];
+EOF
+) > data/local/wellingtoncorpus/Wellington_annotation_removed.txt
+
+  echo "$0: Done copying Wellington corpus"
+fi
+
 mkdir -p data/{train,test,val}
 
 file_name=largeWriterIndependentTextLineRecognitionTask
diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh
index c47284e7692..0e1423af36d 100755
--- a/egs/iam/v1/local/train_lm.sh
+++ b/egs/iam/v1/local/train_lm.sh
@@ -62,6 +62,7 @@ if [ $stage -le 0 ]; then
   local/remove_test_utterances_from_lob.py data/test/text data/val/text \
     > ${dir}/data/text/lob.txt
   cat data/local/browncorpus/brown.txt >> ${dir}/data/text/brown.txt
+  cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt >> ${dir}/data/text/wellington.txt
 
   # use the validation data as the dev set.
   # Note: the name 'dev' is treated specially by pocolm, it automatically
@@ -81,7 +82,7 @@ if [ $stage -le 0 ]; then
   cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt
 
   # get the wordlist from IAM text
-  cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
   head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
 fi
 
diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh
index 00c9f682bf2..c66baf069f9 100755
--- a/egs/iam/v1/run.sh
+++ b/egs/iam/v1/run.sh
@@ -7,6 +7,7 @@ set -e
 stage=0
 nj=20
+decode_gmm=false
 username=
 password=
 
 # iam_database points to the database path on the JHU grid. If you have not
@@ -78,7 +79,7 @@ if [ $stage -le 4 ]; then
     data/lang exp/mono
 fi
 
-if [ $stage -le 5 ]; then
+if [ $stage -le 5 ] && $decode_gmm; then
   utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph
 
   steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \
@@ -93,7 +94,7 @@ if [ $stage -le 6 ]; then
     exp/mono_ali exp/tri
 fi
 
-if [ $stage -le 7 ]; then
+if [ $stage -le 7 ] && $decode_gmm; then
   utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph
 
   steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \
@@ -109,7 +110,7 @@ if [ $stage -le 8 ]; then
     data/train data/lang exp/tri_ali exp/tri2
 fi
 
-if [ $stage -le 9 ]; then
+if [ $stage -le 9 ] && $decode_gmm; then
   utils/mkgraph.sh data/lang_test exp/tri2 exp/tri2/graph
 
   steps/decode.sh --nj $nj --cmd $cmd exp/tri2/graph \
@@ -124,7 +125,7 @@ if [ $stage -le 10 ]; then
     data/train data/lang exp/tri2_ali exp/tri3
 fi
 
-if [ $stage -le 11 ]; then
+if [ $stage -le 11 ] && $decode_gmm; then
   utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph
 
   steps/decode_fmllr.sh --nj $nj --cmd $cmd exp/tri3/graph \
@@ -137,7 +138,7 @@ if [ $stage -le 12 ]; then
 fi
 
 if [ $stage -le 13 ]; then
-  local/chain/run_cnn_1a.sh
+  local/chain/run_cnn_1a.sh --lang-test lang_unk
 fi
 
 if [ $stage -le 14 ]; then