diff --git a/egs/heroico/s5/cmd.sh b/egs/heroico/s5/cmd.sh index a427f3c16a5..533aad25db1 100755 --- a/egs/heroico/s5/cmd.sh +++ b/egs/heroico/s5/cmd.sh @@ -10,6 +10,7 @@ # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export cmd="retry.pl queue.pl" export train_cmd="retry.pl queue.pl" export decode_cmd="retry.pl queue.pl --mem 2G" diff --git a/egs/heroico/s5/local/heroico_download.sh b/egs/heroico/s5/local/heroico_download.sh deleted file mode 100755 index 9c58fe37537..00000000000 --- a/egs/heroico/s5/local/heroico_download.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash - -# Copyright 2018 John Morgan -# Apache 2.0. - -speech=$1 -lexicon=$2 - -download_dir=$(pwd) -tmpdir=data/local/tmp -data_dir=$tmpdir/LDC2006S37/data - -mkdir -p $tmpdir - -# download the corpus from openslr - -if [ ! -f $download_dir/heroico.tar.gz ]; then - wget -O $download_dir/heroico.tar.gz $speech - - ( - cd $download_dir - tar -xzf heroico.tar.gz - ) -fi - -mkdir -p data/local/dict $tmpdir/dict - -# download the dictionary from openslr - -if [ ! -f $download_dir/santiago.tar.gz ]; then - wget -O $download_dir/santiago.tar.gz $lexicon -fi - -( - cd $download_dir - tar -xzf santiago.tar.gz -) diff --git a/egs/heroico/s5/local/subs_prepare_data.pl b/egs/heroico/s5/local/subs_prepare_data.pl index a7e0cfb0c6e..e39db79f610 100755 --- a/egs/heroico/s5/local/subs_prepare_data.pl +++ b/egs/heroico/s5/local/subs_prepare_data.pl @@ -19,7 +19,7 @@ # input and output files -my $corpus = "OpenSubtitles2018.en-es.es"; +my $corpus = "OpenSubtitles.en-es.es"; my $symbol_table = "data/lang/words.txt"; my $filtered = "data/local/tmp/subs/lm/es.txt"; my $oovs = "data/local/tmp/subs/lm/oovs.txt"; diff --git a/egs/heroico/s5/run.sh b/egs/heroico/s5/run.sh index 67ad87e55f9..4cc5617e985 100755 --- a/egs/heroico/s5/run.sh +++ b/egs/heroico/s5/run.sh @@ -9,11 +9,11 @@ stage=0 datadir=/export/corpora5/LDC/LDC2006S37 # The corpus and lexicon are on openslr.org -speech="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" -lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" +#speech_url="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" +lexicon_url="http://www.openslr.org/resources/34/santiago.tar.gz" # Location of the Movie subtitles text corpus -subs_src="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" +subtitles_url="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" . utils/parse_options.sh @@ -26,14 +26,22 @@ set -u tmpdir=data/local/tmp if [ $stage -le 0 ]; then - # download the corpus from openslr - local/heroico_download.sh $speech $lexicon + if [ ! -d $datadir ]; then + echo "$0: please download and un-tar http://www.openslr.org/resources/39/LDC2006S37.tar.gz" + echo " and set $datadir to the directory where it is located." + exit 1 + fi + if [ ! -s santiago.txt ]; then + echo "$0: downloading the lexicon" + wget -c http://www.openslr.org/resources/34/santiago.tar.gz + tar -xvzf santiago.tar.gz + fi # Get data for lm training - local/subs_download.sh $subs_src + local/subs_download.sh $subtitles_url fi if [ $stage -le 1 ]; then - echo "Makin lists for building models." + echo "Making lists for building models." local/prepare_data.sh $datadir fi