diff --git a/egs/cifar/v1/image/ocr/make_features.py b/egs/cifar/v1/image/ocr/make_features.py index 07f3cb12257..a11cbcc7a82 100755 --- a/egs/cifar/v1/image/ocr/make_features.py +++ b/egs/cifar/v1/image/ocr/make_features.py @@ -45,10 +45,13 @@ 'and right side of the image.') parser.add_argument('--num-channels', type=int, default=1, help='Number of color channels') +parser.add_argument('--vertical-shift', type=int, default=0, + help='total number of padding pixels added per column (split between top and bottom)') parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False, help="Flip the image left-right for right to left languages") -parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, - help="performs image augmentation") +parser.add_argument('--augment_type', type=str, default='no_aug', + choices=['no_aug', 'random_scale', 'random_shift'], + help='Type of augmentation to perform on the image.') args = parser.parse_args() @@ -68,7 +71,6 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") - def horizontal_pad(im, allowed_lengths = None): if allowed_lengths is None: left_padding = right_padding = args.padding @@ -112,6 +114,33 @@ def get_scaled_image_aug(im, mode='normal'): return im_scaled_up return im +def vertical_shift(im, mode='normal'): + if args.vertical_shift == 0: + return im + total = args.vertical_shift + if mode == 'notmid': + val = random.randint(0, 1) + if val == 0: + mode = 'top' + else: + mode = 'bottom' + if mode == 'normal': + top = int(total / 2) + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total // 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total // 2) + bottom = total - top + width = im.shape[1] + im_pad = np.concatenate( + (255 * np.ones((top, width), dtype=int) - + np.random.normal(2, 1, (top, width)).astype(int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int) - + np.random.normal(2, 1, (bottom, width)).astype(int)), axis=0) + return im_pad ### main ### random.seed(1) @@ -134,7 +163,6 @@ def get_scaled_image_aug(im, mode='normal'): num_fail = 0 num_ok = 0 -aug_setting = ['normal', 'scaled'] with open(data_list_path) as f: for line in f: line = line.strip() @@ -144,21 +172,25 @@ def get_scaled_image_aug(im, mode='normal'): im = misc.imread(image_path) if args.fliplr: im = np.fliplr(im) - if args.augment: - im_aug = get_scaled_image_aug(im, aug_setting[1]) - else: - im_aug = get_scaled_image_aug(im, aug_setting[0]) - im_horizontal_padded = horizontal_pad(im_aug, allowed_lengths) - if im_horizontal_padded is None: + if args.augment_type in ('no_aug', 'random_shift'): + im = get_scaled_image_aug(im, 'normal') + elif args.augment_type == 'random_scale': + im = get_scaled_image_aug(im, 'scaled') + im = horizontal_pad(im, allowed_lengths) + if im is None: num_fail += 1 continue + if args.augment_type in ('no_aug', 'random_scale'): + im = vertical_shift(im, 'normal') + elif args.augment_type == 'random_shift': + im = vertical_shift(im, 'notmid') if args.num_channels == 1: - data = np.transpose(im_horizontal_padded, (1, 0)) + data = np.transpose(im, (1, 0)) elif args.num_channels == 3: - H = im_horizontal_padded.shape[0] - W = im_horizontal_padded.shape[1] - C = im_horizontal_padded.shape[2] - data = np.reshape(np.transpose(im_horizontal_padded, (1, 0, 2)), (W, H * C)) + H = im.shape[0] + W = im.shape[1] + C = im.shape[2] + data = np.reshape(np.transpose(im, (1, 0,
2)), (W, H * C)) data = np.divide(data, 255.0) num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) diff --git a/egs/madcat_ar/v1/local/chain/compare_wer.sh b/egs/madcat_ar/v1/local/chain/compare_wer.sh index ad90710b13f..7f04061dafb 100755 --- a/egs/madcat_ar/v1/local/chain/compare_wer.sh +++ b/egs/madcat_ar/v1/local/chain/compare_wer.sh @@ -27,6 +27,13 @@ for x in $*; do done echo +echo -n "# WER (rescored) " +for x in $*; do + wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + echo -n "# CER " for x in $*; do cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') @@ -34,6 +41,13 @@ for x in $*; do done echo +echo -n "# CER (rescored) " +for x in $*; do + cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + if $used_epochs; then exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. fi diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh index a3a98ce5ad5..eb140e900e1 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh @@ -21,18 +21,16 @@ reporting_email= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 # we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -168,13 +166,13 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ - --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=4 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -183,10 +181,6 @@ if [ $stage -le 5 ]; then --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ @@ -207,18 +201,20 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. 
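Stepping back to the make_features.py changes at the top of this patch: the new --vertical-shift option pads each line image with noisy near-white rows, split between top and bottom according to the augmentation mode. Below is a minimal standalone sketch of that split, mirroring the vertical_shift() function above; the helper name, the 40x100 example image and the shift value of 16 are only illustrative, while the noise model (255 minus a small Gaussian offset) follows the code in the patch.

```python
import random
import numpy as np

def split_vertical_padding(total, mode='normal'):
    """Return (top, bottom) row counts; mirrors vertical_shift() above."""
    if mode == 'notmid':                            # used for the 'random_shift' augmentation
        mode = 'top' if random.randint(0, 1) == 0 else 'bottom'
    if mode == 'normal':
        top = total // 2                            # split the padding evenly
    elif mode == 'top':
        top = random.randint(total // 2, total)     # more padding on top
    else:                                           # 'bottom': more padding on bottom
        top = random.randint(0, total // 2)
    return top, total - top

# Illustrative 40x100 white "line image" padded by 16 rows in total:
im = 255 * np.ones((40, 100), dtype=int)
top, bottom = split_vertical_padding(16, mode='notmid')
noisy_white = lambda rows, width: (255 * np.ones((rows, width), dtype=int)
                                   - np.random.normal(2, 1, (rows, width)).astype(int))
im_padded = np.concatenate((noisy_white(top, 100), im, noisy_white(bottom, 100)), axis=0)
print(im_padded.shape)  # (56, 100): feature height grows from 40 to 40 + 16
```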
utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh index b652eab034a..5b3597a3915 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh @@ -18,18 +18,15 @@ lats_affix= # chain options train_stage=-10 xent_regularize=0.1 -frame_subsampling_factor=4 # training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options srand=0 remove_egs=false -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -170,13 +167,13 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ - --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ --chain.alignment-subsampling-factor=1 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ --trainer.num-epochs=4 \ - --trainer.frames-per-iter=1000000 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ @@ -185,10 +182,6 @@ if [ $stage -le 5 ]; then --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ @@ -209,18 +202,20 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh index 38387ce2fcc..ee84ea0d83f 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -19,17 +19,14 @@ reporting_email= train_stage=-10 xent_regularize=0.1 frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -171,28 +168,24 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --chain.frame-subsampling-factor=$frame_subsampling_factor \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ --chain.right-tolerance 3 \ --trainer.srand=$srand \ --trainer.max-param-change=2.0 \ - --trainer.num-epochs=2 \ - --trainer.frames-per-iter=1000000 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=2000000 \ --trainer.optimization.num-jobs-initial=3 \ --trainer.optimization.num-jobs-final=16 \ --trainer.optimization.initial-effective-lrate=0.001 \ --trainer.optimization.final-effective-lrate=0.0001 \ --trainer.optimization.shrink-value=1.0 \ - --trainer.num-chunk-per-minibatch=96,64 \ + --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ --trainer.add-option="--optimization.memory-compression-level=2" \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0" \ --cleanup.remove-egs=$remove_egs \ @@ -213,18 +206,20 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh index 55df0cad4b7..c6052b76e7f 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -32,17 +32,14 @@ reporting_email= train_stage=-10 xent_regularize=0.1 frame_subsampling_factor=4 -# training chunk-options chunk_width=340,300,200,100 num_leaves=500 -# we don't need extra left/right context for TDNN systems. -chunk_left_context=0 -chunk_right_context=0 tdnn_dim=450 # training options srand=0 remove_egs=true -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -106,7 +103,6 @@ if [ $stage -le 2 ]; then --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ ${train_data_dir} data/lang $e2echain_model_dir $lat_dir echo "" >$lat_dir/splice_opts - fi if [ $stage -le 3 ]; then @@ -185,7 +181,7 @@ if [ $stage -le 5 ]; then --chain.leaky-hmm-coefficient=0.1 \ --chain.l2-regularize=0.00005 \ --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --chain.frame-subsampling-factor=$frame_subsampling_factor \ --chain.alignment-subsampling-factor=1 \ --chain.left-tolerance 3 \ @@ -201,11 +197,8 @@ if [ $stage -le 5 ]; then --trainer.optimization.shrink-value=1.0 \ --trainer.num-chunk-per-minibatch=64,32 \ --trainer.optimization.momentum=0.0 \ + --trainer.add-option="--optimization.memory-compression-level=2" \ --egs.chunk-width=$chunk_width \ - --egs.chunk-left-context=$chunk_left_context \ - --egs.chunk-right-context=$chunk_right_context \ - --egs.chunk-left-context-initial=0 \ - --egs.chunk-right-context-final=0 \ --egs.dir="$common_egs_dir" \ --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ --cleanup.remove-egs=$remove_egs \ @@ -226,18 +219,20 @@ if [ $stage -le 6 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi if [ $stage -le 7 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ --frames-per-chunk $frames_per_chunk \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi + +echo "Done. Date: $(date). 
Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh index 033cb88df10..2891e50da9e 100755 --- a/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh +++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -27,16 +27,12 @@ affix=1a # training options tdnn_dim=450 -num_epochs=2 -num_jobs_initial=6 -num_jobs_final=16 minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16 common_egs_dir= -l2_regularize=0.00005 -frames_per_iter=2000000 -cmvn_opts="--norm-means=true --norm-vars=true" +cmvn_opts="--norm-means=false --norm-vars=false" train_set=train -lang_test=lang_test +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g # End configuration section. echo "$0 $@" # Print the command line for logging @@ -118,7 +114,7 @@ if [ $stage -le 3 ]; then --cmd "$cmd" \ --feat.cmvn-opts "$cmvn_opts" \ --chain.leaky-hmm-coefficient 0.1 \ - --chain.l2-regularize $l2_regularize \ + --chain.l2-regularize 0.00005 \ --chain.apply-deriv-weights false \ --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ @@ -128,11 +124,11 @@ if [ $stage -le 3 ]; then --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ --trainer.add-option="--optimization.memory-compression-level=2" \ --trainer.num-chunk-per-minibatch $minibatch_size \ - --trainer.frames-per-iter $frames_per_iter \ - --trainer.num-epochs $num_epochs \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ --trainer.optimization.momentum 0 \ - --trainer.optimization.num-jobs-initial $num_jobs_initial \ - --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.num-jobs-initial 6 \ + --trainer.optimization.num-jobs-final 16 \ --trainer.optimization.initial-effective-lrate 0.001 \ --trainer.optimization.final-effective-lrate 0.0001 \ --trainer.optimization.shrink-value 1.0 \ @@ -152,7 +148,7 @@ if [ $stage -le 4 ]; then # as long as phones.txt was compatible. utils/mkgraph.sh \ - --self-loop-scale 1.0 data/$lang_test \ + --self-loop-scale 1.0 $lang_decode \ $dir $dir/graph || exit 1; fi @@ -161,6 +157,9 @@ if [ $stage -le 5 ]; then steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ --nj $nj --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 fi echo "Done. Date: $(date). Results:" diff --git a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py index 34e339f1877..778555c427e 100755 --- a/egs/madcat_ar/v1/local/create_line_image_from_page_image.py +++ b/egs/madcat_ar/v1/local/create_line_image_from_page_image.py @@ -21,22 +21,10 @@ import numpy as np from math import atan2, cos, sin, pi, degrees, sqrt from collections import namedtuple - +import random from scipy.spatial import ConvexHull from PIL import Image from scipy.misc import toimage -import logging - -sys.path.insert(0, 'steps') -logger = logging.getLogger('libs') -logger.setLevel(logging.INFO) -handler = logging.StreamHandler() -handler.setLevel(logging.INFO) -formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " - "%(funcName)s - %(levelname)s ] %(message)s") -handler.setFormatter(formatter) -logger.addHandler(handler) - parser = argparse.ArgumentParser(description="Creates line images from page image", epilog="E.g. 
" + sys.argv[0] + " data/LDC2012T15" " data/LDC2013T09 data/LDC2013T15 data/madcat.train.raw.lineid " @@ -60,8 +48,12 @@ help='Path to the downloaded (and extracted) writing conditions file 3') parser.add_argument('--padding', type=int, default=400, help='padding across horizontal/verticle direction') +parser.add_argument('--pixel-scaling', type=int, default=30, + help='padding across horizontal/verticle direction') parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, help="only processes subset of data based on writing condition") +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") args = parser.parse_args() """ @@ -196,21 +188,6 @@ def rectangle_corners(rectangle): return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points) -def get_orientation(origin, p1, p2): - """ - Given origin and two points, return the orientation of the Point p1 with - regards to Point p2 using origin. - Returns - ------- - integer: Negative if p1 is clockwise of p2. - """ - difference = ( - ((p2[0] - origin[0]) * (p1[1] - origin[1])) - - ((p1[0] - origin[0]) * (p2[1] - origin[1])) - ) - return difference - - def minimum_bounding_box(points): """ Given a list of 2D points, it returns the minimum area rectangle bounding all the points in the point cloud. @@ -357,6 +334,36 @@ def update_minimum_bounding_box_input(bounding_box_input): return updated_minimum_bounding_box_input +def dilate_polygon(points, amount_increase): + """ Increases size of polygon given as a list of tuples. + Assumes points in polygon are given in CCW + """ + expanded_points = [] + for index, point in enumerate(points): + prev_point = points[(index - 1) % len(points)] + next_point = points[(index + 1) % len(points)] + prev_edge = np.subtract(point, prev_point) + next_edge = np.subtract(next_point, point) + + prev_normal = ((1 * prev_edge[1]), (-1 * prev_edge[0])) + prev_normal = np.divide(prev_normal, np.linalg.norm(prev_normal)) + next_normal = ((1 * next_edge[1]), (-1 * next_edge[0])) + next_normal = np.divide(next_normal, np.linalg.norm(next_normal)) + + bisect = np.add(prev_normal, next_normal) + bisect = np.divide(bisect, np.linalg.norm(bisect)) + + cos_theta = np.dot(next_normal, bisect) + hyp = amount_increase / cos_theta + + new_point = np.around(point + hyp * bisect) + new_point = new_point.astype(int) + new_point = new_point.tolist() + new_point = tuple(new_point) + expanded_points.append(new_point) + return expanded_points + + def set_line_image_data(image, line_id, image_file_name, image_fh): """ Given an image, saves a flipped line image. Line image file name is formed by appending the line id at the end page image name. 
@@ -395,50 +402,83 @@ def get_line_images_from_page_image(image_file_name, madcat_file_path, image_fh) word_coordinate = (int(word_node.getAttribute('x')), int(word_node.getAttribute('y'))) minimum_bounding_box_input.append(word_coordinate) updated_mbb_input = update_minimum_bounding_box_input(minimum_bounding_box_input) - bounding_box = minimum_bounding_box(updated_mbb_input) - - p1, p2, p3, p4 = bounding_box.corner_points - x1, y1 = p1 - x2, y2 = p2 - x3, y3 = p3 - x4, y4 = p4 - min_x = int(min(x1, x2, x3, x4)) - min_y = int(min(y1, y2, y3, y4)) - max_x = int(max(x1, x2, x3, x4)) - max_y = int(max(y1, y2, y3, y4)) - box = (min_x, min_y, max_x, max_y) - region_initial = im.crop(box) - rot_points = [] - p1_new = (x1 - min_x, y1 - min_y) - p2_new = (x2 - min_x, y2 - min_y) - p3_new = (x3 - min_x, y3 - min_y) - p4_new = (x4 - min_x, y4 - min_y) - rot_points.append(p1_new) - rot_points.append(p2_new) - rot_points.append(p3_new) - rot_points.append(p4_new) - - cropped_bounding_box = bounding_box_tuple(bounding_box.area, - bounding_box.length_parallel, - bounding_box.length_orthogonal, - bounding_box.length_orthogonal, - bounding_box.unit_vector, - bounding_box.unit_vector_angle, - set(rot_points) - ) - - rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) - img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) - x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + points_ordered = [updated_mbb_input[index] for index in ConvexHull(updated_mbb_input).vertices] + if args.augment: + for i in range(0, 3): + additional_pixel = random.randint(1, args.pixel_scaling) + mar = dilate_polygon(points_ordered, (i-1)*args.pixel_scaling + additional_pixel + 1) + bounding_box = minimum_bounding_box(mar) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) + box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( + cropped_bounding_box, get_center(region_initial)) + + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + line_id = id + '_scale' + str(i) + set_line_image_data(region_final, line_id, image_file_name, image_fh) + else: + bounding_box = minimum_bounding_box(points_ordered) + (x1, y1), (x2, y2), (x3, y3), (x4, y4) = bounding_box.corner_points + min_x, min_y = int(min(x1, x2, x3, x4)), int(min(y1, y2, y3, y4)) + max_x, max_y = int(max(x1, x2, x3, x4)), int(max(y1, y2, y3, y4)) 
+ box = (min_x, min_y, max_x, max_y) + region_initial = im.crop(box) + rot_points = [] + p1, p2 = (x1 - min_x, y1 - min_y), (x2 - min_x, y2 - min_y) + p3, p4 = (x3 - min_x, y3 - min_y), (x4 - min_x, y4 - min_y) + rot_points.append(p1) + rot_points.append(p2) + rot_points.append(p3) + rot_points.append(p4) + + cropped_bounding_box = bounding_box_tuple(bounding_box.area, + bounding_box.length_parallel, + bounding_box.length_orthogonal, + bounding_box.length_orthogonal, + bounding_box.unit_vector, + bounding_box.unit_vector_angle, + set(rot_points) + ) + + rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box) + img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC) + x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points( cropped_bounding_box, get_center(region_initial)) - min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) - max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) - box = (min_x, min_y, max_x, max_y) - region_final = img2.crop(box) - set_line_image_data(region_final, id, image_file_name, image_fh) + min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4)) + max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4)) + box = (min_x, min_y, max_x, max_y) + region_final = img2.crop(box) + set_line_image_data(region_final, id, image_file_name, image_fh) def check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3): @@ -496,6 +536,8 @@ def check_writing_condition(wc_dict, base_name): writing_condition = wc_dict[base_name].strip() if writing_condition != 'IUC': return False + else: + return True else: return True diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 56a8443e328..9fe588f31b8 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -9,6 +9,8 @@ nj=4 cmd=run.pl feat_dim=40 +augment='no_aug' +verticle_shift=0 echo "$0 $@" . ./cmd.sh @@ -34,9 +36,10 @@ done utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ - local/make_features.py $logdir/images.JOB.scp \ + image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim \| \ + --feat-dim $feat_dim --augment_type $augment \ + --vertical-shift $verticle_shift \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/madcat_ar/v1/local/extract_lines.sh b/egs/madcat_ar/v1/local/extract_lines.sh index 50129ad38c9..ab87836ae3a 100755 --- a/egs/madcat_ar/v1/local/extract_lines.sh +++ b/egs/madcat_ar/v1/local/extract_lines.sh @@ -11,6 +11,8 @@ writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_split_file=data/download/data_splits/madcat.dev.raw.lineid data=data/local/dev +subset=false +augment=false echo "$0 $@" . 
./cmd.sh @@ -35,7 +37,7 @@ done $cmd JOB=1:$nj $log_dir/extract_lines.JOB.log \ local/create_line_image_from_page_image.py $download_dir1 $download_dir2 $download_dir3 \ $log_dir/lines.JOB.scp $data/JOB $writing_condition1 $writing_condition2 $writing_condition3 \ - || exit 1; + --subset $subset --augment $augment || exit 1; ## concatenate the .scp files together. for n in $(seq $nj); do diff --git a/egs/madcat_ar/v1/local/make_features.py b/egs/madcat_ar/v1/local/make_features.py deleted file mode 100755 index a21276d32c2..00000000000 --- a/egs/madcat_ar/v1/local/make_features.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2018 Hossein Hadian - -""" This script converts images to Kaldi-format feature matrices. The input to - this script is the path to a data directory, e.g. "data/train". This script - reads the images listed in images.scp and writes them to standard output - (by default) as Kaldi-formatted matrices (in text form). It also scales the - images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. - If an 'image2num_frames' file is found in the data dir, it will be used - to enforce the images to have the specified length in that file by padding - white pixels (the --padding option will be ignored in this case). This relates - to end2end chain training. - - eg. local/make_features.py data/train --feat-dim 40 -""" - -import argparse -import os -import sys -import numpy as np -from scipy import misc - -parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and - writes them to standard output in text format.""") -parser.add_argument('images_scp_path', type=str, - help='Path of images.scp file') -parser.add_argument('--allowed_len_file_path', type=str, default=None, - help='If supplied, each images will be padded to reach the ' - 'target length (this overrides --padding).') -parser.add_argument('--out-ark', type=str, default='-', - help='Where to write the output feature file') -parser.add_argument('--feat-dim', type=int, default=40, - help='Size to scale the height of all images') -parser.add_argument('--padding', type=int, default=5, - help='Number of white pixels to pad on the left' - 'and right side of the image.') - - -args = parser.parse_args() - - -def write_kaldi_matrix(file_handle, matrix, key): - file_handle.write(key + " [ ") - num_rows = len(matrix) - if num_rows == 0: - raise Exception("Matrix is empty") - num_cols = len(matrix[0]) - - for row_index in range(len(matrix)): - if num_cols != len(matrix[row_index]): - raise Exception("All the rows of a matrix are expected to " - "have the same length") - file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) - if row_index != num_rows - 1: - file_handle.write("\n") - file_handle.write(" ]\n") - - -def get_scaled_image(im): - scale_size = args.feat_dim - sx = im.shape[1] # width - sy = im.shape[0] # height - scale = (1.0 * scale_size) / sy - nx = int(scale_size) - ny = int(scale * sx) - im = misc.imresize(im, (nx, ny)) - return im - - -def horizontal_pad(im, allowed_lengths = None): - if allowed_lengths is None: - left_padding = right_padding = args.padding - else: # Find an allowed length for the image - imlen = im.shape[1] # width - allowed_len = 0 - for l in allowed_lengths: - if l > imlen: - allowed_len = l - break - if allowed_len == 0: - # No allowed length was found for the image (the image is too long) - return None 
- padding = allowed_len - imlen - left_padding = int(padding // 2) - right_padding = padding - left_padding - dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), - dtype=int)), axis=1) - return im_pad1 - - -### main ### - -data_list_path = args.images_scp_path - -if args.out_ark == '-': - out_fh = sys.stdout -else: - out_fh = open(args.out_ark,'wb') - -allowed_lengths = None -allowed_len_handle = args.allowed_len_file_path -if os.path.isfile(allowed_len_handle): - print("Found 'allowed_lengths.txt' file...", file=sys.stderr) - allowed_lengths = [] - with open(allowed_len_handle) as f: - for line in f: - allowed_lengths.append(int(line.strip())) - print("Read {} allowed lengths and will apply them to the " - "features.".format(len(allowed_lengths)), file=sys.stderr) - -num_fail = 0 -num_ok = 0 -with open(data_list_path) as f: - for line in f: - line = line.strip() - line_vect = line.split(' ') - image_id = line_vect[0] - image_path = line_vect[1] - im = misc.imread(image_path) - im_scaled = get_scaled_image(im) - im_horizontal_padded = horizontal_pad(im_scaled, allowed_lengths) - if im_horizontal_padded is None: - num_fail += 1 - continue - data = np.transpose(im_horizontal_padded, (1, 0)) - data = np.divide(data, 255.0) - num_ok += 1 - write_kaldi_matrix(out_fh, data, image_id) - -print('Generated features for {} images. Failed for {} (image too ' - 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/madcat_ar/v1/local/prepare_data.sh b/egs/madcat_ar/v1/local/prepare_data.sh deleted file mode 100755 index d808d736845..00000000000 --- a/egs/madcat_ar/v1/local/prepare_data.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora -# 2017 Hossein Hadian -# Apache 2.0 - -# This script prepares the training and test data for MADCAT Arabic dataset -# (i.e text, images.scp, utt2spk and spk2utt). It calls process_data.py. - -# Eg. local/prepare_data.sh -# Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ -# utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 -# images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 -# data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif - -stage=0 -download_dir1=/export/corpora/LDC/LDC2012T15/data -download_dir2=/export/corpora/LDC/LDC2013T09/data -download_dir3=/export/corpora/LDC/LDC2013T15/data -writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab -writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab -writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab -data_splits_dir=data/download/data_splits -images_scp_dir=data/local - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh || exit 1; - -mkdir -p data/{train,test,dev} - -if [ $stage -le 1 ]; then - echo "$0: Processing dev, train and test data..." - echo "Date: $(date)." 
- local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.dev.raw.lineid data/dev $images_scp_dir/dev/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.test.raw.lineid data/test $images_scp_dir/test/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - - local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ - $data_splits_dir/madcat.train.raw.lineid data/train $images_scp_dir/train/images.scp \ - $writing_condition1 $writing_condition2 $writing_condition3 || exit 1 - - for dataset in dev test train; do - echo "$0: Fixing data directory for dataset: $dataset" - echo "Date: $(date)." - image/fix_data_dir.sh data/$dataset - done -fi diff --git a/egs/madcat_ar/v1/local/prepend_words.py b/egs/madcat_ar/v1/local/prepend_words.py deleted file mode 100755 index d53eb8974bf..00000000000 --- a/egs/madcat_ar/v1/local/prepend_words.py +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -# This script, prepend '|' to every words in the transcript to mark -# the beginning of the words for finding the initial-space of every word -# after decoding. - -import sys, io - -infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') -output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') -for line in infile: - output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index 920cb6f700b..e476b67cb96 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -42,6 +42,8 @@ help='Path to the downloaded (and extracted) writing conditions file 2') parser.add_argument('writing_condition3', type=str, help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False, + help="performs image augmentation") parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, help="only processes subset of data based on writing condition") args = parser.parse_args() @@ -103,6 +105,8 @@ def check_writing_condition(wc_dict): writing_condition = wc_dict[base_name].strip() if writing_condition != 'IUC': return False + else: + return True else: return True @@ -184,14 +188,30 @@ def get_line_image_location(): writer_id = writer[0].getAttribute('id') text_line_word_dict = read_text(madcat_xml_path) base_name = os.path.basename(image_file_path).split('.tif')[0] - for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' - location = image_loc_dict[updated_base_name] - image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[lineID] - text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') - image_num += 1 + for line_id in sorted(text_line_word_dict): + if args.augment: + key = (line_id + '.')[:-1] + for i in range(0, 3): + location_id = '_' + line_id + '_scale' + str(i) + line_image_file_name = base_name + location_id + '.png' + location = image_loc_dict[line_image_file_name] + image_file_path = os.path.join(location, 
line_image_file_name) + line = text_line_word_dict[key] + text = ' '.join(line) + base_line_image_file_name = line_image_file_name.split('.png')[0] + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_line_image_file_name + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 + else: + updated_base_name = base_name + '_' + str(line_id).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[line_id] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(line_id).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 diff --git a/egs/madcat_ar/v1/local/tl/augment_data.sh b/egs/madcat_ar/v1/local/tl/augment_data.sh new file mode 100755 index 00000000000..cc44aa58a62 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/augment_data.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright 2018 Hossein Hadian +# 2018 Ashish Arora + +# Apache 2.0 +# This script performs data augmentation. + +nj=4 +cmd=run.pl +feat_dim=40 +verticle_shift=0 +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +srcdir=$1 +outdir=$2 +datadir=$3 +aug_set=aug1 +mkdir -p $datadir/augmentations +echo "copying $srcdir to $datadir/augmentations/$aug_set, copying allowed_lengths.txt and creating feats.scp" + +for set in $aug_set; do + image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \ + $srcdir $datadir/augmentations/$set + cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ + --verticle-shift $verticle_shift \ + --augment 'random_shift' $datadir/augmentations/$set +done + +echo "combining the original data with the augmented data" +utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/$aug_set +cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt diff --git a/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh new file mode 100755 index 00000000000..e0cca104f50 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/chain/run_cnn_e2eali.sh @@ -0,0 +1,229 @@ +#!/bin/bash + +# ./local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a/ +# System cnn_e2eali_1a +# WER 16.78 +# CER 5.22 +# Final train prob -0.1189 +# Final valid prob -0.1319 +# Final train prob (xent) -0.6395 +# Final valid prob (xent) -0.6732 +# Parameters 3.73M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a/ +# exp/chain/cnn_e2eali_1a/: num-iters=24 nj=3..15 num-params=3.7M dim=56->392 combine=-0.125->-0.125 (over 1) xent:train/valid[15,23,final]=(-0.850,-1.24,-0.640/-0.901,-1.31,-0.673) logprob:train/valid[15,23,final]=(-0.149,-0.209,-0.119/-0.166,-0.229,-0.132) +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a # affix for the chain directory, e.g. "1a" or "1b", in case we change the configuration. +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +tdnn_dim=450 +srand=0 +remove_egs=true +lang_decode=data/lang +# End configuration section.
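A small sanity check on the configuration above and on the feature settings used elsewhere in this patch: run_text_localization.sh extracts 40-dimensional features and adds a total vertical shift of 16 rows, which is consistent with the input dim=56 used by the new chain scripts under local/tl/, and the chunk widths above are all multiples of the frame-subsampling factor used in training. The sketch below is just illustrative arithmetic with values copied from the scripts in this patch.

```python
# Values taken from run_text_localization.sh / extract_features.sh in this patch.
feat_dim = 40            # --feat-dim passed to local/extract_features.sh
verticle_shift = 16      # total rows of top+bottom padding added by vertical_shift()
print(feat_dim + verticle_shift)    # 56, matching "input dim=56" in the xconfig below

frame_subsampling_factor = 4
for chunk in (340, 300, 200, 100):  # chunk_width above
    assert chunk % frame_subsampling_factor == 0
    print(chunk, '->', chunk // frame_subsampling_factor, 'chain output frames')
```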
+echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts + +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 4 \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=4 \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=2 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=16 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..3fca8cf5fdc --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/chain/run_e2e_cnn.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1a +# WER 19.30 +# CER 5.72 +# Final train prob -0.0734 +# Final valid prob -0.0607 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 3.30M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=24 nj=3..15 num-params=3.3M dim=56->292 combine=-0.060->-0.060 (over 1) logprob:train/valid[15,23,final]=(-0.122,-0.143,-0.073/-0.105,-0.132,-0.061) + +set -e + + +# configs for 'chain' +stage=0 +nj=30 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4 +common_egs_dir= +frames_per_iter=1000000 +cmvn_opts="--norm-means=false --norm-vars=false" +train_set=train +lang_decode=data/lang + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \ + --shared-phones true \ + --type mono \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=56 name=input + conv-relu-batchnorm-layer name=cnn1 height-in=56 height-out=56 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=56 height-out=28 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=28 height-out=28 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=28 height-out=14 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=14 height-out=14 time-offsets=-4,0,4 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 2000000 \ + --trainer.num-epochs 2 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 3 \ + --trainer.optimization.num-jobs-final 16 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/madcat_ar/v1/local/tl/process_waldo_data.py b/egs/madcat_ar/v1/local/tl/process_waldo_data.py new file mode 100755 index 00000000000..0d278e64122 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/process_waldo_data.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 + +""" This script reads image and transcription mapping and creates the following files :text, utt2spk, images.scp. + Eg. local/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + Eg. text file: LDC0001_000404_NHR_ARB_20070113.0052_11_LDC0001_00z2 ﻮﺠﻫ ﻮﻌﻘﻟ ﻍﺍﺮﻗ ﺢﺗّﻯ ﺎﻠﻨﺧﺎﻋ + utt2spk file: LDC0001_000397_NHR_ARB_20070113.0052_11_LDC0001_00z1 LDC0001 + images.scp file: LDC0009_000000_arb-NG-2-76513-5612324_2_LDC0009_00z0 + data/local/lines/1/arb-NG-2-76513-5612324_2_LDC0009_00z0.tif +""" + +import argparse +import os +import sys + +parser = argparse.ArgumentParser(description="Creates text, utt2spk and images.scp files", + epilog="E.g. " + sys.argv[0] + " data/train data/local/lines ", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument('image_transcription_file', type=str, + help='Path to the file containing line image path and transcription information') +parser.add_argument('out_dir', type=str, + help='directory location to write output files.') +args = parser.parse_args() + + +def read_image_text(image_text_path): + """ Given the file path containing, mapping information of line image + and transcription, it returns a dict. The dict contains this mapping + info. It can be accessed via line_id and will provide transcription. 
+ Returns: + -------- + dict: line_id and transcription mapping + """ + image_transcription_dict = dict() + with open(image_text_path, encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + image_path = line_vect[0] + line_id = os.path.basename(image_path).split('.png')[0] + transcription = line_vect[1:] + joined_transcription = list() + for word in transcription: + joined_transcription.append(word) + joined_transcription = " ".join(joined_transcription) + image_transcription_dict[line_id] = joined_transcription + return image_transcription_dict + + +### main ### +print("Processing '{}' data...".format(args.out_dir)) +text_file = os.path.join(args.out_dir, 'text') +text_fh = open(text_file, 'w', encoding='utf-8') +utt2spk_file = os.path.join(args.out_dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w', encoding='utf-8') +image_file = os.path.join(args.out_dir, 'images.scp') +image_fh = open(image_file, 'w', encoding='utf-8') + +image_transcription_dict = read_image_text(args.image_transcription_file) +for line_id in sorted(image_transcription_dict.keys()): + writer_id = line_id.strip().split('_')[-3] + updated_line_id = line_id + '.png' + image_file_path = os.path.join('lines', updated_line_id) + text = image_transcription_dict[line_id] + utt_id = line_id + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + diff --git a/egs/madcat_ar/v1/local/tl/run_text_localization.sh b/egs/madcat_ar/v1/local/tl/run_text_localization.sh new file mode 100755 index 00000000000..8d12f7d802f --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/run_text_localization.sh @@ -0,0 +1,143 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian +# 2018 Ashish Arora + +# This script performs full page text recognition on automatically extracted line images +# from madcat arabic data. It is created as a separate scrip, because it performs +# data augmentation, uses smaller language model and calls process_waldo_data for +# test images (automatically extracted line images). Data augmentation increases image +# height hence requires different DNN arachitecture and different chain scripts. + +set -e +stage=0 +nj=70 +# download_dir{1,2,3} points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# This corpus can be purchased here: +# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09/,LDC2013T15/} +download_dir1=/export/corpora/LDC/LDC2012T15/data +download_dir2=/export/corpora/LDC/LDC2013T09/data +download_dir3=/export/corpora/LDC/LDC2013T15/data +writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab +writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab +writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab +data_splits_dir=data/download/data_splits +images_scp_dir=data/local +overwrite=false +subset=true +augment=true +verticle_shift=16 +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. +./local/check_tools.sh + +mkdir -p data/{train,test,dev}/data +mkdir -p data/local/{train,test,dev} +if [ $stage -le 0 ]; then + + if [ -f data/train/text ] && ! 
$overwrite; then + echo "$0: Not processing, probably the script was run from the wrong stage" + echo "Exiting with status 1 to avoid data corruption" + exit 1; + fi + echo "$0: Downloading data splits...$(date)" + local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ + --download_dir2 $download_dir2 --download_dir3 $download_dir3 + + for set in train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid + local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ + --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ + --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ + --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ + --data data/local/$set --subset $subset --augment $augment || exit 1 + done + + echo "$0: Preparing data..." + for set in dev train; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} + done + + local/tl/process_waldo_data.py lines/hyp_line_image_transcription_mapping_kaldi.txt data/test + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi + +if [ $stage -le 1 ]; then + echo "$0: Obtaining image groups: calling get_image2num_frames $(date)." + image/get_image2num_frames.py data/train + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + for set in dev train test; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 \ + --verticle_shift $verticle_shift data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; + done + echo "$0: Fixing data directory for train dataset $(date)." + image/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + for set in train; do + echo "$(date) stage 2: Performing augmentation; this will double the training data" + local/tl/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 \ + --verticle_shift $verticle_shift data/${set} data/${set}_aug data + steps/compute_cmvn_stats.sh data/${set}_aug || exit 1; + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev train_aug; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang +fi + +if [ $stage -le 4 ]; then + echo "$0: Estimating a language model for decoding..."
+ local/tl/train_lm.sh --order 3 + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \ + data/local/dict/lexicon.txt data/lang +fi + +nj=30 +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe... $(date)." + local/tl/chain/run_e2e_cnn.sh --nj $nj --train_set train_aug +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model...$(date)." + steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train_aug data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)" + local/tl/chain/run_cnn_e2eali.sh --nj $nj --train_set train_aug +fi diff --git a/egs/madcat_ar/v1/local/tl/train_lm.sh b/egs/madcat_ar/v1/local/tl/train_lm.sh new file mode 100755 index 00000000000..524bb2e9f40 --- /dev/null +++ b/egs/madcat_ar/v1/local/tl/train_lm.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 +# +# This script trains an LM on the training transcriptions. +# It is based on the example scripts distributed with PocoLM. + +# It checks whether pocolm is installed and, if not, prints installation instructions and exits. + +set -e +stage=0 +dir=data/local/local_lm +order=3 +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +lm_dir=${dir}/data + + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters, +# set bypass_metaparam_optim_opt above, changing the numbers to some appropriate values. +# You can find the values in the output log of train_lm.py. +# These example numbers of metaparameters are for a 4-gram model (with min-counts) +# trained with train_lm.py. +# The dev perplexity should be close to that of the non-bypassed model. +# Note: to use these example parameters, you may need to remove the .done files +# to make sure make_lm_dir.py is called again and trains only a 3-gram model. +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm; it automatically + # becomes the dev set. + + cat data/dev/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources.
+ cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from MADCAT text + cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz +fi diff --git a/egs/madcat_ar/v1/local/wer_output_filter b/egs/madcat_ar/v1/local/wer_output_filter index c0f03e7178a..d6d46f3f565 100755 --- a/egs/madcat_ar/v1/local/wer_output_filter +++ b/egs/madcat_ar/v1/local/wer_output_filter @@ -2,6 +2,9 @@ # Copyright 2012-2014 Johns Hopkins University (Author: Yenda Trmal) # Apache 2.0 +# This script converts a BPE-encoded text to normal text and performs normalization. +# It is used in scoring. + use utf8; use open qw(:encoding(utf8)); diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh index f6a63320497..d3937582662 100755 --- a/egs/madcat_ar/v1/run.sh +++ b/egs/madcat_ar/v1/run.sh @@ -32,7 +32,6 @@ mkdir -p data/{train,test,dev}/data mkdir -p data/local/{train,test,dev} if [ $stage -le 0 ]; then - if [ -f data/train/text ] && ! $overwrite; then echo "$0: Not processing, probably script have run from wrong stage" echo "Exiting with status 1 to avoid data corruption" @@ -42,30 +41,27 @@ if [ $stage -le 0 ]; then echo "$0: Downloading data splits...$(date)" local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 -fi -if [ $stage -le 1 ]; then - for dataset in test train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + for set in test train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$set --subset $subset --augment $augment || exit 1 done -fi -if [ $stage -le 2 ]; then echo "$0: Preparing data..." 
- local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 + for set in dev train test; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} + done fi -mkdir -p data/{train,test,dev}/data -if [ $stage -le 3 ]; then +if [ $stage -le 1 ]; then for dataset in test train; do local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset steps/compute_cmvn_stats.sh data/$dataset || exit 1; @@ -73,33 +69,53 @@ if [ $stage -le 3 ]; then utils/fix_data_dir.sh data/train fi -if [ $stage -le 4 ]; then - echo "$0: Preparing dictionary and lang..." +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + + for set in test train dev; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + rm -f data/$set/bpe_text data/$set/ids + done + + echo "$0:Preparing dictionary and lang..." local/prepare_dict.sh - utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ - data/local/dict "" data/lang/temp data/lang + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang fi -if [ $stage -le 5 ]; then +if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." 
local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g fi -if [ $stage -le 6 ]; then +if [ $stage -le 4 ]; then steps/train_mono.sh --nj $nj --cmd $cmd --totgauss 10000 data/train \ data/lang exp/mono fi -if [ $stage -le 7 ] && $decode_gmm; then - utils/mkgraph.sh --mono data/lang_test exp/mono exp/mono/graph +if [ $stage -le 5 ] && $decode_gmm; then + utils/mkgraph.sh --mono data/lang exp/mono exp/mono/graph steps/decode.sh --nj $nj --cmd $cmd exp/mono/graph data/test \ exp/mono/decode_test fi -if [ $stage -le 8 ]; then +if [ $stage -le 6 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/mono exp/mono_ali @@ -107,14 +123,14 @@ if [ $stage -le 8 ]; then exp/mono_ali exp/tri fi -if [ $stage -le 9 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri exp/tri/graph +if [ $stage -le 7 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri exp/tri/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri/graph data/test \ exp/tri/decode_test fi -if [ $stage -le 10 ]; then +if [ $stage -le 8 ]; then steps/align_si.sh --nj $nj --cmd $cmd data/train data/lang \ exp/tri exp/tri_ali @@ -123,22 +139,22 @@ if [ $stage -le 10 ]; then data/train data/lang exp/tri_ali exp/tri3 fi -if [ $stage -le 11 ] && $decode_gmm; then - utils/mkgraph.sh data/lang_test exp/tri3 exp/tri3/graph +if [ $stage -le 9 ] && $decode_gmm; then + utils/mkgraph.sh data/lang exp/tri3 exp/tri3/graph steps/decode.sh --nj $nj --cmd $cmd exp/tri3/graph \ data/test exp/tri3/decode_test fi -if [ $stage -le 12 ]; then +if [ $stage -le 10 ]; then steps/align_fmllr.sh --nj $nj --cmd $cmd --use-graphs true \ data/train data/lang exp/tri3 exp/tri3_ali fi -if [ $stage -le 13 ]; then +if [ $stage -le 11 ]; then local/chain/run_cnn.sh fi -if [ $stage -le 14 ]; then +if [ $stage -le 12 ]; then local/chain/run_cnn_chainali.sh --stage 2 fi diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index 3986ede9d7f..de67e444f39 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -15,8 +15,10 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab data_splits_dir=data/download/data_splits +images_scp_dir=data/local overwrite=false - +subset=false +augment=false . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. ## This relates to the queue. . 
./path.sh @@ -37,20 +39,23 @@ if [ $stage -le 0 ]; then local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \ --download_dir2 $download_dir2 --download_dir3 $download_dir3 - for dataset in test train dev; do - data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid + for set in test train dev; do + data_split_file=$data_splits_dir/madcat.$set.raw.lineid local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \ --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ --download_dir3 $download_dir3 --writing_condition1 $writing_condition1 \ --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \ - --data data/local/$dataset + --data data/local/$set --subset $subset --augment $augment || exit 1 done echo "$0: Preparing data..." - local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \ - --download_dir3 $download_dir3 --images_scp_dir data/local \ - --data_splits_dir $data_splits_dir --writing_condition1 $writing_condition1 \ - --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 + for set in dev train test; do + local/process_data.py $download_dir1 $download_dir2 $download_dir3 \ + $data_splits_dir/madcat.$set.raw.lineid data/$set $images_scp_dir/$set/images.scp \ + $writing_condition1 $writing_condition2 $writing_condition3 --augment $augment --subset $subset + image/fix_data_dir.sh data/${set} + done + fi if [ $stage -le 1 ]; then @@ -58,10 +63,10 @@ if [ $stage -le 1 ]; then image/get_image2num_frames.py data/train image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train - for dataset in test train; do - echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)" - local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset - steps/compute_cmvn_stats.sh data/$dataset || exit 1; + for set in test train; do + echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $set. $(date)" + local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$set + steps/compute_cmvn_stats.sh data/$set || exit 1; done echo "$0: Fixing data directory for train dataset $(date)." utils/fix_data_dir.sh data/train @@ -69,14 +74,14 @@ fi if [ $stage -le 2 ]; then echo "$0: Preparing BPE..." - cut -d' ' -f2- data/train/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + cut -d' ' -f2- data/train/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt for set in test train dev; do cut -d' ' -f1 data/$set/text > data/$set/ids - cut -d' ' -f2- data/$set/text | local/reverse.py | \ - utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \ + cut -d' ' -f2- data/$set/text | utils/lang/bpe/reverse.py | \ + utils/lang/bpe/prepend_words.py | \ utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ | sed 's/@@//g' > data/$set/bpe_text @@ -95,8 +100,10 @@ fi if [ $stage -le 3 ]; then echo "$0: Estimating a language model for decoding..." 
local/train_lm.sh - utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ - data/local/dict/lexicon.txt data/lang_test + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g fi if [ $stage -le 4 ]; then diff --git a/egs/madcat_ar/v1/local/reverse.py b/egs/wsj/s5/utils/lang/bpe/reverse.py similarity index 100% rename from egs/madcat_ar/v1/local/reverse.py rename to egs/wsj/s5/utils/lang/bpe/reverse.py diff --git a/egs/yomdle_fa/v1/local/augment_data.sh b/egs/yomdle_fa/v1/local/augment_data.sh index 34e938db069..1c38bcb072d 100755 --- a/egs/yomdle_fa/v1/local/augment_data.sh +++ b/egs/yomdle_fa/v1/local/augment_data.sh @@ -9,6 +9,7 @@ nj=4 cmd=run.pl feat_dim=40 fliplr=false +verticle_shift=0 echo "$0 $@" . ./cmd.sh @@ -27,7 +28,9 @@ for set in aug1; do $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ - --fliplr $fliplr --augment true $datadir/augmentations/$set + --vertical-shift $verticle_shift \ + --fliplr $fliplr --augment 'random_scale' $datadir/augmentations/$set + done echo " combine original data and data from different augmentations" diff --git a/egs/yomdle_fa/v1/local/extract_features.sh b/egs/yomdle_fa/v1/local/extract_features.sh index 7d6806a2712..f75837ae5b3 100755 --- a/egs/yomdle_fa/v1/local/extract_features.sh +++ b/egs/yomdle_fa/v1/local/extract_features.sh @@ -6,7 +6,7 @@ nj=4 cmd=run.pl feat_dim=40 fliplr=false -augment=false +augment='no_aug' num_channels=3 echo "$0 $@" @@ -35,7 +35,7 @@ utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment $augment \| \ + --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment_type $augment \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/yomdle_tamil/v1/local/augment_data.sh b/egs/yomdle_tamil/v1/local/augment_data.sh index 82fa5230a43..136bfd24eb2 100755 --- a/egs/yomdle_tamil/v1/local/augment_data.sh +++ b/egs/yomdle_tamil/v1/local/augment_data.sh @@ -8,6 +8,7 @@ nj=4 cmd=run.pl feat_dim=40 +verticle_shift=0 echo "$0 $@" . 
./cmd.sh @@ -26,7 +27,8 @@ for set in aug1; do $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ - --fliplr false --augment true $datadir/augmentations/$set + --vertical-shift $verticle_shift \ + --fliplr false --augment 'random_scale' $datadir/augmentations/$set done echo " combine original data and data from different augmentations" diff --git a/egs/yomdle_tamil/v1/local/extract_features.sh b/egs/yomdle_tamil/v1/local/extract_features.sh index 4ed6ba04348..3880ebad3e8 100755 --- a/egs/yomdle_tamil/v1/local/extract_features.sh +++ b/egs/yomdle_tamil/v1/local/extract_features.sh @@ -9,7 +9,7 @@ nj=4 cmd=run.pl feat_dim=40 -augment=false +augment='no_aug' fliplr=false echo "$0 $@" @@ -38,7 +38,7 @@ utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --fliplr $fliplr --augment $augment \| \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp diff --git a/egs/yomdle_zh/v1/local/augment_data.sh b/egs/yomdle_zh/v1/local/augment_data.sh index 34e938db069..1f13ed15ded 100755 --- a/egs/yomdle_zh/v1/local/augment_data.sh +++ b/egs/yomdle_zh/v1/local/augment_data.sh @@ -9,6 +9,7 @@ nj=4 cmd=run.pl feat_dim=40 fliplr=false +verticle_shift=0 echo "$0 $@" . ./cmd.sh @@ -27,7 +28,8 @@ for set in aug1; do $srcdir $datadir/augmentations/$set cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \ - --fliplr $fliplr --augment true $datadir/augmentations/$set + --vertical-shift $verticle_shift \ + --fliplr $fliplr --augment 'random_scale' $datadir/augmentations/$set done echo " combine original data and data from different augmentations" diff --git a/egs/yomdle_zh/v1/local/extract_features.sh b/egs/yomdle_zh/v1/local/extract_features.sh index 7d6806a2712..f75837ae5b3 100755 --- a/egs/yomdle_zh/v1/local/extract_features.sh +++ b/egs/yomdle_zh/v1/local/extract_features.sh @@ -6,7 +6,7 @@ nj=4 cmd=run.pl feat_dim=40 fliplr=false -augment=false +augment='no_aug' num_channels=3 echo "$0 $@" @@ -35,7 +35,7 @@ utils/split_scp.pl $scp $split_scps || exit 1; $cmd JOB=1:$nj $logdir/extract_features.JOB.log \ image/ocr/make_features.py $logdir/images.JOB.scp \ --allowed_len_file_path $data/allowed_lengths.txt \ - --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment $augment \| \ + --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment_type $augment \| \ copy-feats --compress=true --compression-method=7 \ ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp
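For reference, the reworked augmentation path above is driven from the per-recipe augment_data.sh scripts, which now pass a vertical shift and an augmentation type down to extract_features.sh (and from there to image/ocr/make_features.py via --augment_type). The following is only a minimal usage sketch of that call, not part of the patch: the data directory, job count, queue command and shift value are illustrative assumptions, and the option spelling mirrors the calls added in augment_data.sh.

# Sketch (assumed values): extract 40-dim features for one augmented copy of the
# training data, requesting 'random_scale' augmentation, then accumulate CMVN stats.
local/extract_features.sh --nj 4 --cmd "run.pl" --feat-dim 40 \
  --vertical-shift 16 \
  --fliplr false --augment 'random_scale' data/train_aug1
steps/compute_cmvn_stats.sh data/train_aug1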