Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions egs/madcat_ar/v1/local/download_data.sh

This file was deleted.

69 changes: 69 additions & 0 deletions egs/madcat_ar/v1/local/prepare_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash

# Copyright 2017 Chun Chieh Chang
#           2017 Ashish Arora
#           2017 Hossein Hadian
# Apache 2.0

# This script downloads the data splits for MADCAT Arabic dataset and prepares the training
# validation, and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py.
# It also uses Arabic Gigaword text corpus for language modeling.

# Eg. local/prepare_data.sh
# Eg. text file: LDC0001_000399_NHR_ARB_20070113.0052_11_LDC0001_0z11
#                وهناك تداخل بين الرأسمالية الإسرائيلية
#     utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001
#     images.scp file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1
#                      data/local/train/1/NHR_ARB_20070113.0052_11_LDC0001_00z1.png

# Locations of the three MADCAT Arabic LDC releases (must already be on disk).
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
# OpenSLR-hosted train/dev/test line-id split files.
train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid
test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid
dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid
data_splits=data/download/data_splits
stage=0
download_dir=data/download
gigacorpus=data/local/gigawordcorpus
gigaword_loc=/export/corpora5/LDC/LDC2011T11
use_extra_corpus_text=true

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

# Download the data splits unless they are already present.
if [ -d "$data_splits" ]; then
  echo "$0: Not downloading the data splits as it is already there."
else
  mkdir -p "$data_splits"
  echo "$0: Downloading the data splits..."
  wget -P "$data_splits" "$train_split_url" || exit 1;
  wget -P "$data_splits" "$test_split_url" || exit 1;
  wget -P "$data_splits" "$dev_split_url" || exit 1;
  echo "$0: Done downloading the data splits"
fi

# The MADCAT corpora are LDC releases and cannot be fetched automatically;
# we can only check for their presence and warn the user.
# (The original used '[ ! -f $dir/madcat/*.madcat.xml ]', which is a glob
# inside 'test -f' and errors out when the glob matches more than one file.)
if [ -d "$download_dir1" ]; then
  echo "$0: madcat arabic data directory is present."
else
  echo "$0: please download madcat data..."
fi

mkdir -p "$download_dir" data/local
if $use_extra_corpus_text; then
  # Copy the Arabic Gigaword corpus locally, decompress each newswire source,
  # and concatenate its files (with SGML tags stripped and `` / '' quotes
  # normalized to ") into one <newswire>_combined.txt per source.
  # NOTE(review): appending with '>>' means a re-run duplicates text; remove
  # data/local/gigawordcorpus before re-running this stage.
  mkdir -p "$gigacorpus"
  cp -r "$gigaword_loc/." "$gigacorpus"
  for newswire in aaw_arb afp_arb ahr_arb asb_arb hyt_arb nhr_arb qds_arb umh_arb xin_arb; do
    for file in "$gigacorpus"/arb_gw_5/data/"$newswire"/*.gz; do
      [ -e "$file" ] || continue  # skip when the glob matches nothing (e.g. already decompressed)
      gzip -d "$file"
    done
    for file in "$gigacorpus"/arb_gw_5/data/"$newswire"/*; do
      [ -e "$file" ] || continue
      sed -e '/^<[^>]*>$/d; s/``/"/g; s/\x27\x27/"/g' "$file" >> "$gigacorpus/arb_gw_5/data/${newswire}_combined.txt"
    done
  done
fi
9 changes: 5 additions & 4 deletions egs/madcat_ar/v1/run_end2end.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ images_scp_dir=data/local
overwrite=false
subset=false
augment=false
use_extra_corpus_text=true
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh
Expand All @@ -35,9 +36,9 @@ if [ $stage -le 0 ]; then
echo "Exiting with status 1 to avoid data corruption"
exit 1;
fi
echo "$0: Downloading data splits...$(date)"
local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3
local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3 \
--use_extra_corpus_text $use_extra_corpus_text

for set in test train dev; do
data_split_file=$data_splits_dir/madcat.$set.raw.lineid
Expand All @@ -48,7 +49,7 @@ if [ $stage -le 0 ]; then
--data data/local/$set --subset $subset --augment $augment || exit 1
done

echo "$0: Preparing data..."
echo "$0: Processing data..."
for set in dev train test; do
local/process_data.py $download_dir1 $download_dir2 $download_dir3 \
$data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \
Expand Down
13 changes: 13 additions & 0 deletions egs/rimes/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Rimes is a French handwriting recognition database created by A2iA.
The database was created by asking individuals to write letters on a given scenario like
a change of personal information, payment difficulty, damage declaration. The
dataset has been used in several international research competitions, including
the ICFHR 2008, ICDAR 2009, and ICDAR 2011 competitions for isolated word-level
and line-level recognition tasks.

It contains 11333 training lines and 788 test lines. It does not include
a validation split, but in a recent publication a 10% sample of the
training lines was held out for validation
(http://www.jpuigcerver.net/pubs/jpuigcerver_icdar2017.pdf).
We have used a similar train, test and validation split.
More info: http://www.a2ialab.com/doku.php?id=rimes_database:start
13 changes: 13 additions & 0 deletions egs/rimes/v1/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# You can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful, and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub); slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export cmd="retry.pl queue.pl"
1 change: 1 addition & 0 deletions egs/rimes/v1/image
88 changes: 88 additions & 0 deletions egs/rimes/v1/local/chain/compare_wer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/bash

# This script is used for comparing decoding results between systems.
# It prints one column per experiment directory: WER/CER on the test and
# validation sets, final train/valid probabilities, and the model size.
# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}

# Copyright 2017 Chun Chieh Chang
#           2017 Ashish Arora

# '==' inside '[' is a bashism; use the POSIX '-eq' comparison.
if [ $# -eq 0 ]; then
  echo "Usage: $0: <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
  exit 1
fi
. ./path.sh

echo "# $0 $*"
used_epochs=false

# Header row: one system name per column.
echo -n "# System "
for x in "$@"; do printf "% 10s" " $(basename "$x")"; done
echo

echo -n "# WER "
for x in "$@"; do
  wer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

echo -n "# CER "
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

echo -n "# WER val "
for x in "$@"; do
  wer=$(awk '{print $2}' "$x/decode_val/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

echo -n "# CER val "
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_val/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

if $used_epochs; then
  exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi

# Objective values from the nnet3 training diagnostics logs.
echo -n "# Final train prob "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final valid prob "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final train prob (xent) "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final valid prob (xent) "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Parameters "
for x in "$@"; do
  params=$(nnet3-info "$x/final.mdl" 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}')
  printf "% 10s" "$params"
done
echo
1 change: 1 addition & 0 deletions egs/rimes/v1/local/chain/run_cnn_e2eali.sh
1 change: 1 addition & 0 deletions egs/rimes/v1/local/chain/run_e2e_cnn.sh
Loading