diff --git a/egs/madcat_ar/v1/local/chain/run_cnn.sh b/egs/madcat_ar/v1/local/chain/run_cnn.sh
new file mode 120000
index 00000000000..df6f0a468c1
--- /dev/null
+++ b/egs/madcat_ar/v1/local/chain/run_cnn.sh
@@ -0,0 +1 @@
+tuning/run_cnn_1a.sh
\ No newline at end of file
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh b/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh
new file mode 120000
index 00000000000..a864819f542
--- /dev/null
+++ b/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh
@@ -0,0 +1 @@
+tuning/run_cnn_chainali_1a.sh
\ No newline at end of file
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh
new file mode 120000
index 00000000000..fcf59f917c1
--- /dev/null
+++ b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh
@@ -0,0 +1 @@
+tuning/run_cnn_e2eali_1b.sh
\ No newline at end of file
diff --git a/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh
new file mode 120000
index 00000000000..d26ba0182ce
--- /dev/null
+++ b/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh
@@ -0,0 +1 @@
+tuning/run_e2e_cnn_1a.sh
\ No newline at end of file
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
similarity index 100%
rename from egs/madcat_ar/v1/local/chain/run_cnn_1a.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
similarity index 100%
rename from egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
similarity index 100%
rename from egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
similarity index 99%
rename from egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
index 75c246f5ffe..55df0cad4b7 100755
--- a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh
+++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
@@ -193,7 +193,7 @@ if [ $stage -le 5 ]; then
     --trainer.srand=$srand \
     --trainer.max-param-change=2.0 \
     --trainer.num-epochs=4 \
-    --trainer.frames-per-iter=1000000 \
+    --trainer.frames-per-iter=2000000 \
     --trainer.optimization.num-jobs-initial=3 \
     --trainer.optimization.num-jobs-final=16 \
     --trainer.optimization.initial-effective-lrate=0.001 \
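The four run_*.sh entry points above are relative symlinks into the new tuning/ directory, the convention used across Kaldi recipes: the default system is switched by repointing a link rather than editing scripts. A sketch of how such links are (re)created, using the targets from this patch; the ln commands themselves are illustrative, not part of the patch:

    # run from egs/madcat_ar/v1/local/chain/
    ln -sf tuning/run_cnn_1a.sh          run_cnn.sh
    ln -sf tuning/run_cnn_chainali_1a.sh run_cnn_chainali.sh
    ln -sf tuning/run_cnn_e2eali_1b.sh   run_cnn_e2eali.sh
    ln -sf tuning/run_e2e_cnn_1a.sh      run_e2e_cnn.sh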
diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh
similarity index 93%
rename from egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh
index 2c85e982ce6..033cb88df10 100755
--- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh
+++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh
@@ -5,16 +5,16 @@
 # local/chain/compare_wer.sh exp/chain/e2e_cnn_1a
 # System                      e2e_cnn_1a
-# WER                             10.71
-# CER                              2.85
-# Final train prob              -0.0859
-# Final valid prob              -0.1266
+# WER                              7.81
+# CER                              2.05
+# Final train prob              -0.0812
+# Final valid prob              -0.0708
 # Final train prob (xent)
 # Final valid prob (xent)
 # Parameters                      2.94M
 
 # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/
-# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127)
+# exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.073->-0.073 (over 2) logprob:train/valid[64,97,final]=(-0.084,-0.080,-0.081/-0.073,-0.070,-0.071)
 
 set -e
@@ -33,7 +33,7 @@ num_jobs_final=16
 minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16
 common_egs_dir=
 l2_regularize=0.00005
-frames_per_iter=1000000
+frames_per_iter=2000000
 cmvn_opts="--norm-means=true --norm-vars=true"
 train_set=train
 lang_test=lang_test
@@ -125,6 +125,7 @@ if [ $stage -le 3 ]; then
     --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
     --chain.frame-subsampling-factor 4 \
     --chain.alignment-subsampling-factor 4 \
+    --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
     --trainer.add-option="--optimization.memory-compression-level=2" \
     --trainer.num-chunk-per-minibatch $minibatch_size \
     --trainer.frames-per-iter $frames_per_iter \
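Doubling --trainer.frames-per-iter from 1000000 to 2000000 means each iteration consumes twice as many frames, so with the same data, number of epochs, and job schedule the iteration count roughly halves; this is consistent with num-iters dropping from 195 to 98 in the chain_dir_info.pl lines above. The iteration count of any trained directory can be checked with the same tool:

    # prints num-iters, the nj range, num-params and train/valid log-probs
    steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/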
""" - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False - - return True - + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True ### main ### - def main(): wc_dict1 = parse_writing_conditions(args.writing_condition1) @@ -564,8 +519,7 @@ def main(): madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) if wc_dict is None or not check_writing_condition(wc_dict, base_name): continue - if madcat_file_path is not None: - get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) if __name__ == '__main__': diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 70c5498626c..56a8443e328 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -1,7 +1,11 @@ #!/bin/bash + # Copyright 2017 Yiwen Shao # 2018 Ashish Arora +# Apache 2.0 +# This script runs the make features script in parallel. + nj=4 cmd=run.pl feat_dim=40 diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index b57500cf2fa..920cb6f700b 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -42,6 +42,8 @@ help='Path to the downloaded (and extracted) writing conditions file 2') parser.add_argument('writing_condition3', type=str, help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") args = parser.parse_args() @@ -97,50 +99,40 @@ def check_writing_condition(wc_dict): Returns: (bool): True if writing condition matches. """ - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True - return True - -def get_word_line_mapping(madcat_file_path): +def read_text(madcat_file_path): """ Maps every word in the page image to a corresponding line. Args: - madcat_file_path (string): complete path and name of the madcat xml file + madcat_file_path (string): complete path and name of the madcat xml file corresponding to the page image. Returns: + dict: Mapping every word in the page image to a corresponding line. """ + + word_line_dict = dict() doc = minidom.parse(madcat_file_path) zone = doc.getElementsByTagName('zone') for node in zone: line_id = node.getAttribute('id') - line_word_dict[line_id] = list() word_image = node.getElementsByTagName('token-image') for tnode in word_image: word_id = tnode.getAttribute('id') - line_word_dict[line_id].append(word_id) word_line_dict[word_id] = line_id - -def read_text(madcat_file_path): - """ Maps every word in the page image to a corresponding line. - Args: - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - Returns: - dict: Mapping every word in the page image to a corresponding line. 
- """ text_line_word_dict = dict() - doc = minidom.parse(madcat_file_path) segment = doc.getElementsByTagName('segment') for node in segment: token = node.getElementsByTagName('token') for tnode in token: ref_word_id = tnode.getAttribute('ref_id') word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue - word = unicodedata.normalize('NFKC',word) ref_line_id = word_line_dict[ref_word_id] if ref_line_id not in text_line_word_dict: text_line_word_dict[ref_line_id] = list() @@ -160,7 +152,6 @@ def get_line_image_location(): ### main ### - print("Processing '{}' data...".format(args.out_dir)) text_file = os.path.join(args.out_dir, 'text') @@ -188,24 +179,19 @@ def get_line_image_location(): madcat_xml_path, image_file_path, wc_dict = check_file_location() if wc_dict is None or not check_writing_condition(wc_dict): continue - if madcat_xml_path is not None: - madcat_doc = minidom.parse(madcat_xml_path) - writer = madcat_doc.getElementsByTagName('writer') - writer_id = writer[0].getAttribute('id') - line_word_dict = dict() - word_line_dict = dict() - get_word_line_mapping(madcat_xml_path) - text_line_word_dict = read_text(madcat_xml_path) - base_name = os.path.basename(image_file_path) - base_name, b = base_name.split('.tif') - for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' - location = image_loc_dict[updated_base_name] - image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[lineID] - text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') - image_num += 1 + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path).split('.tif')[0] + for lineID in sorted(text_line_word_dict): + updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[lineID] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh index 2c11aba3e13..31564d25326 100755 --- a/egs/madcat_ar/v1/local/score.sh +++ b/egs/madcat_ar/v1/local/score.sh @@ -1,5 +1,5 @@ #!/bin/bash -steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index 3b8a382cb00..b7fc0b09a46 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -6,20 +6,19 @@ # 2017 Hossein Hadian # Apache 2.0 # -# This script trains a LM on the MADCAT training transcriptions. +# This script trains a LM on the training transcriptions. 
diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh
index 2c11aba3e13..31564d25326 100755
--- a/egs/madcat_ar/v1/local/score.sh
+++ b/egs/madcat_ar/v1/local/score.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 
-steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@"
-steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@"
+steps/scoring/score_kaldi_wer.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
 
diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh
index 3b8a382cb00..b7fc0b09a46 100755
--- a/egs/madcat_ar/v1/local/train_lm.sh
+++ b/egs/madcat_ar/v1/local/train_lm.sh
@@ -6,20 +6,19 @@
 #                2017  Hossein Hadian
 # Apache 2.0
 #
-# This script trains a LM on the MADCAT training transcriptions.
+# This script trains an LM on the training transcriptions.
 # It is based on the example scripts distributed with PocoLM
 # It will check if pocolm is installed and if not will proceed with installation
 
 set -e
 stage=0
-
+dir=data/local/local_lm
+order=6
 echo "$0 $@"  # Print the command line for logging
 . ./utils/parse_options.sh || exit 1;
 
-dir=data/local/local_lm
 lm_dir=${dir}/data
-segments=data/train/segmented_words
 
 mkdir -p $dir
@@ -43,12 +42,10 @@ bypass_metaparam_optim_opt=
 # These example numbers of metaparameters is for 4-gram model (with min-counts)
 # running with train_lm.py.
 # The dev perplexity should be close to the non-bypassed model.
-#bypass_metaparam_optim_opt=
 # Note: to use these example parameters, you may need to remove the .done files
 # to make sure the make_lm_dir.py be called and tain only 3-gram model
 #for order in 3; do
 #rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
-
 if [ $stage -le 0 ]; then
   mkdir -p ${dir}/data
   mkdir -p ${dir}/data/text
@@ -65,7 +62,7 @@ if [ $stage -le 0 ]; then
   # use the training data as an additional data source.
   # we can later fold the dev data into this.
-  cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt
+  cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt
 
   # for reporting perplexities, we'll use the "real" dev set.
   # (the validation data is used as ${dir}/data/text/dev.txt to work
@@ -75,12 +72,10 @@ if [ $stage -le 0 ]; then
   cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt
 
   # get the wordlist from MADCAT text
-  cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
   cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
 fi
 
-order=3
-
 if [ $stage -le 1 ]; then
   # decide on the vocabulary.
   # Note: you'd use --wordlist if you had a previously determined word-list
@@ -88,7 +83,7 @@ if [ $stage -le 1 ]; then
   # Note: if you have more than one order, use a certain amount of words as the
   # vocab and want to restrict max memory for 'sort',
   echo "$0: training the unpruned LM"
-  min_counts='train=2 madcat=1'
+  min_counts='train=1'
   wordlist=${dir}/data/wordlist
 
   lm_name="`basename ${wordlist}`_${order}"
@@ -96,13 +91,34 @@ if [ $stage -le 1 ]; then
     lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
   fi
   unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
-  train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \
+  train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \
               --limit-unk-history=true \
               ${bypass_metaparam_optim_opt} \
               ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
 
   get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
-
   mkdir -p ${dir}/data/arpa
   format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
 fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: pruning the LM (to larger size)"
+  # Using 1 million n-grams for a big LM for rescoring purposes.
+  size=1000000
+  prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity'
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: pruning the LM (to smaller size)"
+  # Using 500k n-grams for a smaller LM for graph building. Pruning from the
+  # bigger pruned LM is faster than pruning from the unpruned one.
+  size=500000
+  prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity'
+
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
+fi
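After this change train_lm.sh leaves three ARPA LMs in data/local/local_lm/data/arpa: ${order}gram_unpruned.arpa.gz, ${order}gram_big.arpa.gz (1M n-grams) and ${order}gram_small.arpa.gz (500k n-grams). A hedged sketch of how such a pruned pair is typically consumed in Kaldi recipes; the run scripts in this patch still format the unpruned LM, so the lang-dir names below are hypothetical:

    # small pruned LM -> first-pass decoding graph
    utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \
      data/local/dict/lexicon.txt data/lang_test_small
    # big pruned LM -> const-arpa lattice rescoring
    utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_big.arpa.gz \
      data/lang data/lang_test_big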
diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh
index 14c8bf7a6ce..f6a63320497 100755
--- a/egs/madcat_ar/v1/run.sh
+++ b/egs/madcat_ar/v1/run.sh
@@ -11,9 +11,7 @@ decode_gmm=false
 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not
 # already downloaded the database you can set it to a local directory
 # This corpus can be purchased here:
-# https://catalog.ldc.upenn.edu/LDC2012T15,
-# https://catalog.ldc.upenn.edu/LDC2013T09/,
-# https://catalog.ldc.upenn.edu/LDC2013T15/.
+# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09,LDC2013T15}
 download_dir1=/export/corpora/LDC/LDC2012T15/data
 download_dir2=/export/corpora/LDC/LDC2013T09/data
 download_dir3=/export/corpora/LDC/LDC2013T15/data
@@ -21,7 +19,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
 writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
 writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
 data_splits_dir=data/download/data_splits
-
+overwrite=false
 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.
 . ./path.sh
@@ -34,8 +32,14 @@ mkdir -p data/{train,test,dev}/data
 mkdir -p data/local/{train,test,dev}
 
 if [ $stage -le 0 ]; then
-  echo "$0: Downloading data splits..."
-  echo "Date: $(date)."
+
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing: data/train/text already exists and overwrite=false."
+    echo "$0: The script was probably restarted from the wrong stage; exiting to avoid data corruption."
+    exit 1;
+  fi
+
+  echo "$0: Downloading data splits...$(date)"
   local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
     --download_dir2 $download_dir2 --download_dir3 $download_dir3
 fi
@@ -79,7 +83,7 @@ fi
 if [ $stage -le 5 ]; then
   echo "$0: Estimating a language model for decoding..."
   local/train_lm.sh
-  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \
+  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
     data/local/dict/lexicon.txt data/lang_test
 fi
@@ -132,9 +136,9 @@ if [ $stage -le 12 ]; then
 fi
 
 if [ $stage -le 13 ]; then
-  local/chain/run_cnn_1a.sh
+  local/chain/run_cnn.sh
 fi
 
 if [ $stage -le 14 ]; then
-  local/chain/run_cnn_chainali_1a.sh --stage 2
+  local/chain/run_cnn_chainali.sh --stage 2
 fi
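The new guard aborts stage 0 when data/train/text already exists, so a restart from the wrong stage cannot silently clobber prepared data. Assuming the script picks up its top-level variables via utils/parse_options.sh, as Kaldi run scripts usually do (the sourcing is not visible in these hunks), a deliberate regeneration would look like:

    # explicitly allow stage 0 to overwrite existing data
    ./run.sh --stage 0 --overwrite true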
diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh
index 5d27476d3e1..3986ede9d7f 100755
--- a/egs/madcat_ar/v1/run_end2end.sh
+++ b/egs/madcat_ar/v1/run_end2end.sh
@@ -7,9 +7,7 @@ nj=70
 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not
 # already downloaded the database you can set it to a local directory
 # This corpus can be purchased here:
-# https://catalog.ldc.upenn.edu/LDC2012T15,
-# https://catalog.ldc.upenn.edu/LDC2013T09/,
-# https://catalog.ldc.upenn.edu/LDC2013T15/.
+# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09,LDC2013T15}
 download_dir1=/export/corpora/LDC/LDC2012T15/data
 download_dir2=/export/corpora/LDC/LDC2013T09/data
 download_dir3=/export/corpora/LDC/LDC2013T15/data
@@ -17,6 +15,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
 writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
 writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
 data_splits_dir=data/download/data_splits
+overwrite=false
 
 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.
@@ -27,15 +26,17 @@ data_splits_dir=data/download/data_splits
 mkdir -p data/{train,test,dev}/data
 mkdir -p data/local/{train,test,dev}
 
-
 if [ $stage -le 0 ]; then
-  echo "$0: Downloading data splits..."
-  echo "Date: $(date)."
+
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing: data/train/text already exists and overwrite=false."
+    echo "$0: The script was probably restarted from the wrong stage; exiting to avoid data corruption."
+    exit 1;
+  fi
+  echo "$0: Downloading data splits...$(date)"
   local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
     --download_dir2 $download_dir2 --download_dir3 $download_dir3
-fi
 
-if [ $stage -le 1 ]; then
   for dataset in test train dev; do
     data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid
     local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \
@@ -44,9 +45,7 @@ if [ $stage -le 1 ]; then
       --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \
       --data data/local/$dataset
   done
-fi
 
-if [ $stage -le 2 ]; then
   echo "$0: Preparing data..."
   local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \
     --download_dir3 $download_dir3 --images_scp_dir data/local \
@@ -54,75 +53,66 @@ if [ $stage -le 2 ]; then
     --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3
 fi
 
-if [ $stage -le 3 ]; then
-  echo "$0: Obtaining image groups. calling get_image2num_frames"
-  echo "Date: $(date)."
-  image/get_image2num_frames.py data/train # This will be needed for the next command
-  # The next command creates a "allowed_lengths.txt" file in data/train
-  # which will be used by local/make_features.py to enforce the images to
-  # have allowed lengths. The allowed lengths will be spaced by 10% difference in length.
-  echo "$0: Obtaining image groups. calling get_allowed_lengths"
-  echo "Date: $(date)."
+if [ $stage -le 1 ]; then
+  echo "$0: Obtaining image groups: calling get_image2num_frames $(date)."
+  image/get_image2num_frames.py data/train
   image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train
-fi
 
-if [ $stage -le 4 ]; then
   for dataset in test train; do
-    echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. "
-    echo "Date: $(date)."
+    echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)"
    local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset
     steps/compute_cmvn_stats.sh data/$dataset || exit 1;
   done
-  echo "$0: Fixing data directory for train dataset"
-  echo "Date: $(date)."
+  echo "$0: Fixing data directory for train dataset $(date)."
   utils/fix_data_dir.sh data/train
 fi
 
-if [ $stage -le 5 ]; then
-  echo "$0: Preparing dictionary and lang..."
+if [ $stage -le 2 ]; then
+  echo "$0: Preparing BPE..."
   cut -d' ' -f2- data/train/text | local/reverse.py | \
-    local/prepend_words.py | \
-    utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out
+    utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \
+    utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt
+
   for set in test train dev; do
     cut -d' ' -f1 data/$set/text > data/$set/ids
     cut -d' ' -f2- data/$set/text | local/reverse.py | \
-      local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \
+      utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \
+      utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
      | sed 's/@@//g' > data/$set/bpe_text
+
     mv data/$set/text data/$set/text.old
     paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
+    rm -f data/$set/bpe_text data/$set/ids
   done
+
+  echo "$0: Preparing dictionary and lang..."
   local/prepare_dict.sh
-  # This recipe uses byte-pair encoding; the silences are part of the words' pronunciations.
-  # So we set --sil-prob to 0.0
   utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
     data/local/dict "<sil>" data/lang/temp data/lang
   utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
 fi
 
-if [ $stage -le 6 ]; then
+if [ $stage -le 3 ]; then
   echo "$0: Estimating a language model for decoding..."
   local/train_lm.sh
-  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \
+  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
     data/local/dict/lexicon.txt data/lang_test
 fi
 
-if [ $stage -le 7 ]; then
-  echo "$0: Calling the flat-start chain recipe..."
-  echo "Date: $(date)."
-  local/chain/run_flatstart_cnn1a.sh --nj $nj
+if [ $stage -le 4 ]; then
+  echo "$0: Calling the flat-start chain recipe... $(date)."
+  local/chain/run_e2e_cnn.sh --nj $nj
 fi
 
-if [ $stage -le 8 ]; then
-  echo "$0: Aligning the training data using the e2e chain model..."
-  echo "Date: $(date)."
+if [ $stage -le 5 ]; then
+  echo "$0: Aligning the training data using the e2e chain model...$(date)."
   steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
     --use-gpu false \
     --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
     data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
-if [ $stage -le 9 ]; then
-  echo "$0: Building a tree and training a regular chain model using the e2e alignments..."
-  echo "Date: $(date)."
-  local/chain/run_cnn_e2eali_1b.sh --nj $nj
+if [ $stage -le 6 ]; then
+  echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)"
+  local/chain/run_cnn_e2eali.sh --nj $nj
 fi
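The BPE stage above learns 700 merge operations on reversed, word-prepended training text, applies them to every split, and then strips the '@@' continuation markers with sed, leaving the subword units as plain space-separated tokens. A toy round trip with the same tools, assuming the codes file from the patch exists; the input string is only for illustration:

    echo "<some text>" | local/reverse.py | utils/lang/bpe/prepend_words.py --encoding 'utf-8' \
      | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | sed 's/@@//g'

Once both systems are trained, the comparison helper referenced in the tuning-script headers can put them side by side, e.g. local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_e2eali_1b.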