Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 0 additions & 40 deletions egs/madcat_ar/v1/local/download_data.sh

This file was deleted.

69 changes: 69 additions & 0 deletions egs/madcat_ar/v1/local/prepare_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash

# Copyright 2017 Chun Chieh Chang
#           2017 Ashish Arora
#           2017 Hossein Hadian
# Apache 2.0

# This script downloads the data splits for MADCAT Arabic dataset and prepares the training
# validation, and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py.
# It also uses Arabic Gigaword text corpus for language modeling.

# Eg. local/prepare_data.sh
# Eg. text file: LDC0001_000399_NHR_ARB_20070113.0052_11_LDC0001_0z11
#                وهناك تداخل بين الرأسمالية الإسرائيلية
#     utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001
#     images.scp file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1
#                      data/local/train/1/NHR_ARB_20070113.0052_11_LDC0001_00z1.png

# Locations of the three MADCAT Arabic LDC releases (must already be on disk).
download_dir1=/export/corpora/LDC/LDC2012T15/data
download_dir2=/export/corpora/LDC/LDC2013T09/data
download_dir3=/export/corpora/LDC/LDC2013T15/data
# OpenSLR-hosted train/dev/test line-id split files.
train_split_url=http://www.openslr.org/resources/48/madcat.train.raw.lineid
test_split_url=http://www.openslr.org/resources/48/madcat.test.raw.lineid
dev_split_url=http://www.openslr.org/resources/48/madcat.dev.raw.lineid
data_splits=data/download/data_splits
stage=0
download_dir=data/download
gigacorpus=data/local/gigawordcorpus
gigaword_loc=/export/corpora5/LDC/LDC2011T11
use_extra_corpus_text=true

. ./cmd.sh
. ./path.sh
. ./utils/parse_options.sh || exit 1;

# Download the data splits unless they are already present.
if [ -d "$data_splits" ]; then
  echo "$0: Not downloading the data splits as it is already there."
else
  mkdir -p "$data_splits"
  echo "$0: Downloading the data splits..."
  wget -P "$data_splits" "$train_split_url" || exit 1;
  wget -P "$data_splits" "$test_split_url" || exit 1;
  wget -P "$data_splits" "$dev_split_url" || exit 1;
  echo "$0: Done downloading the data splits"
fi

# The MADCAT corpora are LDC releases and cannot be fetched automatically;
# we can only check for their presence and warn the user.
# (The original used '[ ! -f $dir/madcat/*.madcat.xml ]', which is a glob
# inside 'test -f' and errors out when the glob matches more than one file.)
if [ -d "$download_dir1" ]; then
  echo "$0: madcat arabic data directory is present."
else
  echo "$0: please download madcat data..."
fi

mkdir -p "$download_dir" data/local
if $use_extra_corpus_text; then
  # Copy the Arabic Gigaword corpus locally, decompress each newswire source,
  # and concatenate its files (with SGML tags stripped and `` / '' quotes
  # normalized to ") into one <newswire>_combined.txt per source.
  # NOTE(review): appending with '>>' means a re-run duplicates text; remove
  # data/local/gigawordcorpus before re-running this stage.
  mkdir -p "$gigacorpus"
  cp -r "$gigaword_loc/." "$gigacorpus"
  for newswire in aaw_arb afp_arb ahr_arb asb_arb hyt_arb nhr_arb qds_arb umh_arb xin_arb; do
    for file in "$gigacorpus"/arb_gw_5/data/"$newswire"/*.gz; do
      [ -e "$file" ] || continue  # skip when the glob matches nothing (e.g. already decompressed)
      gzip -d "$file"
    done
    for file in "$gigacorpus"/arb_gw_5/data/"$newswire"/*; do
      [ -e "$file" ] || continue
      sed -e '/^<[^>]*>$/d; s/``/"/g; s/\x27\x27/"/g' "$file" >> "$gigacorpus/arb_gw_5/data/${newswire}_combined.txt"
    done
  done
fi
9 changes: 5 additions & 4 deletions egs/madcat_ar/v1/run_end2end.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ images_scp_dir=data/local
overwrite=false
subset=false
augment=false
use_extra_corpus_text=true
. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
## This relates to the queue.
. ./path.sh
Expand All @@ -35,9 +36,9 @@ if [ $stage -le 0 ]; then
echo "Exiting with status 1 to avoid data corruption"
exit 1;
fi
echo "$0: Downloading data splits...$(date)"
local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3
local/prepare_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
--download_dir2 $download_dir2 --download_dir3 $download_dir3 \
--use_extra_corpus_text $use_extra_corpus_text

for set in test train dev; do
data_split_file=$data_splits_dir/madcat.$set.raw.lineid
Expand All @@ -48,7 +49,7 @@ if [ $stage -le 0 ]; then
--data data/local/$set --subset $subset --augment $augment || exit 1
done

echo "$0: Preparing data..."
echo "$0: Processing data..."
for set in dev train test; do
local/process_data.py $download_dir1 $download_dir2 $download_dir3 \
$data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \
Expand Down
13 changes: 13 additions & 0 deletions egs/rimes/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Rimes is a French handwriting recognition database created by A2iA.
The database was created by asking individuals to write letters on a given scenario like
a change of personal information, payment difficulty, damage declaration. The
dataset has been used in several international research competitions, including
the ICFHR 2008, ICDAR 2009, and ICDAR 2011 competitions for isolated word-level
and line-level recognition tasks.

It contains 11333 training lines and 788 test lines. It does not include
a validation split, but in a recent publication a 10% sample of the
training lines was held out for validation
(http://www.jpuigcerver.net/pubs/jpuigcerver_icdar2017.pdf).
We have used a similar train, test and validation split.
More info: http://www.a2ialab.com/doku.php?id=rimes_database:start
13 changes: 13 additions & 0 deletions egs/rimes/v1/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# You can change cmd.sh depending on what type of queue you are using.
# If you have no queueing system and want to run on a local machine, you
# can change all instances of 'queue.pl' to 'run.pl' (but be careful, and run
# commands one by one: most recipes will exhaust the memory on your
# machine). queue.pl works with GridEngine (qsub); slurm.pl works
# with slurm. Different queues are configured differently, with different
# queue names and different ways of specifying things like memory;
# to account for these differences you can create and edit the file
# conf/queue.conf to match your queue's configuration. Search for
# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.

export cmd="retry.pl queue.pl"
1 change: 1 addition & 0 deletions egs/rimes/v1/image
88 changes: 88 additions & 0 deletions egs/rimes/v1/local/chain/compare_wer.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/bin/bash

# This script is used for comparing decoding results between systems.
# It prints one column per experiment directory: WER/CER on the test and
# validation sets, final train/valid probabilities, and the model size.
# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}

# Copyright 2017 Chun Chieh Chang
#           2017 Ashish Arora

# '==' inside '[' is a bashism; use the POSIX '-eq' comparison.
if [ $# -eq 0 ]; then
  echo "Usage: $0: <dir1> [<dir2> ... ]"
  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
  exit 1
fi
. ./path.sh

echo "# $0 $*"
used_epochs=false

# Header row: one system name per column.
echo -n "# System "
for x in "$@"; do printf "% 10s" " $(basename "$x")"; done
echo

echo -n "# WER "
for x in "$@"; do
  wer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

echo -n "# CER "
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_test/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

echo -n "# WER val "
for x in "$@"; do
  wer=$(awk '{print $2}' "$x/decode_val/scoring_kaldi/best_wer")
  printf "% 10s" "$wer"
done
echo

echo -n "# CER val "
for x in "$@"; do
  cer=$(awk '{print $2}' "$x/decode_val/scoring_kaldi/best_cer")
  printf "% 10s" "$cer"
done
echo

if $used_epochs; then
  exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems.
fi

# Objective values from the nnet3 training diagnostics logs.
echo -n "# Final train prob "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final valid prob "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -v xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final train prob (xent) "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_train.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Final valid prob (xent) "
for x in "$@"; do
  prob=$(grep Overall "$x/log/compute_prob_valid.final.log" | grep -w xent | awk '{printf("%.4f", $8)}')
  printf "% 10s" "$prob"
done
echo

echo -n "# Parameters "
for x in "$@"; do
  params=$(nnet3-info "$x/final.mdl" 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}')
  printf "% 10s" "$params"
done
echo
1 change: 1 addition & 0 deletions egs/rimes/v1/local/chain/run_cnn_e2eali.sh
1 change: 1 addition & 0 deletions egs/rimes/v1/local/chain/run_e2e_cnn.sh
Loading