# Log to stderr through the 'libs' logger, with timestamps and call-site info.
logger = logging.getLogger('libs')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setLevel(logging.INFO)
formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
                              "%(funcName)s - %(levelname)s ] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def get_args():
    """Parse the command line of this script.

    Returns
    -------
    argparse.Namespace with the attributes ``factor`` (float, spacing in
    percent), ``srcdir`` (str), ``coverage_factor`` (float, percent) and
    ``frame_subsampling_factor`` (int).
    """
    parser = argparse.ArgumentParser(description="""This script finds a set of
                                     allowed lengths for a given OCR/HWR data dir.
                                     Intended for chain training.""")
    # 'factor' is a required positional argument: argparse never applies a
    # default to a positional without nargs='?', so none is declared here.
    parser.add_argument('factor', type=float,
                        help='Spacing (in percentage) between allowed lengths.')
    parser.add_argument('srcdir', type=str,
                        help='path to source data dir')
    parser.add_argument('--coverage-factor', type=float, default=0.05,
                        help="""Percentage of durations not covered from each
                        side of duration histogram.""")
    parser.add_argument('--frame-subsampling-factor', type=int, default=3,
                        help="""Chain frame subsampling factor.
                        See steps/nnet3/chain/train.py""")

    return parser.parse_args()


def read_kaldi_mapfile(path):
    """Read a Kaldi mapping file (text, .scp files, etc.) into a dict.

    Each non-empty line is split at its first space: the part before it is
    the key and the remainder of the line is the value.  Blank lines are
    skipped, and a line without any space maps its whole content to ''.

    Returns
    -------
    dict mapping key (str) -> value (str)
    """
    mapping = {}
    with open(path, 'r', encoding='latin-1') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # partition() keeps the full key when there is no space, whereas
            # slicing with find() == -1 would drop the key's last character.
            key, _, val = line.partition(' ')
            mapping[key] = val
    return mapping


def _first_duration_past_coverage(durs, tot_dur, coverage_factor):
    """Return the first duration in `durs` at which the running sum exceeds
    `coverage_factor` percent of `tot_dur`."""
    accumulated = 0
    for d in durs:
        accumulated += d
        if accumulated * 100.0 / tot_dur > coverage_factor:
            return d
    raise ValueError("coverage factor {} leaves no lengths to "
                     "cover".format(coverage_factor))


def find_duration_range(img2len, coverage_factor):
    """Find the start and end lengths to cover for a set of images.

    If we tried to cover all lengths which occur in the training set, the
    number of allowed lengths could become very large.  Instead, the
    shortest (resp. longest) images whose summed length stays within
    `coverage_factor` percent of the total summed length are left out on
    each side.

    Parameters
    ----------
    img2len: dict mapping image id -> length (anything accepted by int())
    coverage_factor: float, percentage of the total length ignored per side

    Returns
    -------
    start_dur: int (never smaller than 30)
    end_dur: int

    Raises
    ------
    ValueError: if `img2len` is empty, its total length is not positive, or
        `coverage_factor` is so large that nothing is left to cover.
    """
    durs = sorted(int(imlen) for imlen in img2len.values())
    tot_dur = sum(durs)
    if not durs or tot_dur <= 0:
        raise ValueError("cannot find a length range: no images with a "
                         "positive length were given")
    start_dur = _first_duration_past_coverage(durs, tot_dur, coverage_factor)
    end_dur = _first_duration_past_coverage(reversed(durs), tot_dur,
                                            coverage_factor)
    if start_dur < 30:
        start_dur = 30  # a hard limit to avoid too many allowed lengths --not critical
    return start_dur, end_dur


def find_allowed_durations(start_len, end_len, args):
    """Find the allowed lengths between `start_len` and `end_len`.

    Starting at `start_len`, each length is rounded down to a multiple of
    args.frame_subsampling_factor, recorded, and then multiplied by
    args.factor to get the next candidate, until the candidate reaches
    `end_len`.  Note that args.factor is used as a multiplier here (main()
    converts the percentage given on the command line to 1 + factor/100).
    The lengths are also written, one per line, to 'allowed_lengths.txt' in
    args.srcdir.

    Returns
    -------
    allowed_lengths: list of allowed lengths (int, strictly increasing)

    Raises
    ------
    ValueError: if args.frame_subsampling_factor is not positive.
    """
    subsampling = args.frame_subsampling_factor
    if subsampling <= 0:
        raise ValueError("frame subsampling factor must be positive, "
                         "got {}".format(subsampling))

    allowed_lengths = []
    length = start_len
    out_path = os.path.join(args.srcdir, 'allowed_lengths.txt')
    with open(out_path, 'w', encoding='latin-1') as fp:
        while length < end_len:
            rounded = subsampling * int(length // subsampling)
            if allowed_lengths and rounded <= allowed_lengths[-1]:
                # The multiplicative step was smaller than the subsampling
                # factor, so rounding down landed on the previous length
                # again.  Without this bump the loop would never terminate
                # (e.g. 28 -> 30.8 -> 28 for a 10% step and subsampling 4).
                rounded = allowed_lengths[-1] + subsampling
                if rounded >= end_len:
                    break
            allowed_lengths.append(rounded)
            fp.write("{}\n".format(rounded))
            length = rounded * args.factor
    return allowed_lengths


def main():
    """Read 'image2num_frames' from the source dir and write the allowed
    lengths for that dir to 'allowed_lengths.txt'."""
    args = get_args()
    if args.factor <= 0:
        raise ValueError("factor must be a positive percentage, "
                         "got {}".format(args.factor))
    # Turn the percentage into a multiplier, e.g. 10 -> 1.1
    args.factor = 1.0 + args.factor / 100.0

    image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames'))

    start_dur, end_dur = find_duration_range(image2length, args.coverage_factor)
    logger.info("Lengths in the range [{},{}] will be covered. "
                "Coverage rate: {}%".format(start_dur, end_dur,
                                            100.0 - args.coverage_factor * 2))

    allowed_lengths = find_allowed_durations(start_dur, end_dur, args)
    # Report the number of lengths actually written rather than an estimate.
    logger.info("There will be {} unique allowed lengths "
                "for the images.".format(len(allowed_lengths)))


if __name__ == '__main__':
    main()
def get_scaled_image_length(im, feat_dim=None):
    """Return the width an image will have once its height is scaled to
    `feat_dim` pixels (aspect ratio preserved, result truncated to int).

    Parameters
    ----------
    im: object with a `shape` attribute whose first two entries are read as
        (height, width)
    feat_dim: target height; when None, the value of the module-level
        command-line option `args.feat_dim` is used, as before.

    Raises
    ------
    ValueError: if the image height is not positive.
    """
    if feat_dim is None:
        feat_dim = args.feat_dim
    width = im.shape[1]
    height = im.shape[0]
    if height <= 0:
        raise ValueError("cannot scale an image of height {}".format(height))
    scale = (1.0 * feat_dim) / height
    return int(scale * width)
-1.1537 +# Parameters 3.96M 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c +# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. 
The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.075" + opts_2="l2-regularize=0.075" + opts_3="l2-regularize=0.1" + common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn2 
input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $opts_3 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..65eeedcc75b --- /dev/null +++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,165 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a +# System cnn_1a cnn_chainali_1c e2e_cnn_1a +# WER 18.58 12.84 15.46 +# CER 10.17 6.40 7.21 +# Final train prob -0.0122 -0.0120 -0.0426 +# Final valid prob -0.0999 -0.0199 -0.0724 +# Final train prob (xent) -0.5652 -0.9973 +# Final valid prob (xent) -0.9758 -1.1537 +# Parameters 4.36M 3.96M 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.040->-0.040 (over 1) logprob:train/valid[13,20,final]=(-0.065,-0.046,-0.043/-0.081,-0.073,-0.072) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train_e2e +lang_test=lang_test + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + cp exp/chain/e2e_base/phone_lm.fst $treedir/ +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + opts="l2-regularize=0.075" + opts_2="l2-regularize=0.075" + opts_3="l2-regularize=0.1" + common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output 
include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py index b998464953f..8cfca5ee830 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to this script is the path to a data directory, e.g. "data/train". This script @@ -9,6 +10,10 @@ (by default) as Kaldi-formatted matrices (in text form). It also scales the images so they have the same height (via --feat-dim). It can optionally pad the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. eg. 
local/make_features.py data/train --feat-dim 40 """ @@ -30,6 +35,8 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') + + args = parser.parse_args() @@ -49,7 +56,7 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im): +def get_scaled_image(im, allowed_lengths = None): scale_size = args.feat_dim sx = im.shape[1] sy = im.shape[0] @@ -57,22 +64,48 @@ def get_scaled_image(im): nx = int(scale_size) ny = int(scale * sx) im = misc.imresize(im, (nx, ny)) - padding_x = args.padding - padding_y = im.shape[0] - im_pad = np.concatenate((255 * np.ones((padding_y, padding_x), + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = padding // 2 + right_padding = padding - left_padding + dim_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((padding_y, padding_x), + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 ### main ### -data_list_path = os.path.join(args.dir,'images.scp') +data_list_path = os.path.join(args.dir, 'images.scp') if args.out_ark == '-': out_fh = sys.stdout else: out_fh = open(args.out_ark,'wb') +allowed_lengths = None +if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} 
allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 with open(data_list_path) as f: for line in f: line = line.strip() @@ -80,8 +113,15 @@ def get_scaled_image(im): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scale = get_scaled_image(im) + im_scaled = get_scaled_image(im, allowed_lengths) - data = np.transpose(im_scale, (1, 0)) + if im_scaled is None: + num_fail += 1 + continue + data = np.transpose(im_scaled, (1, 0)) data = np.divide(data, 255.0) + num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (iamge too ' + 'long).'.format(num_ok, num_fail)) diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh new file mode 100755 index 00000000000..d479bfa2a73 --- /dev/null +++ b/egs/iam/v1/run_end2end.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." 
+ local/prepare_data.sh --download-dir "$iam_database" \ + --username "$username" --password "$password" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --sil-prob 0.95 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + + +if [ $stage -le 4 ]; then + echo "$0: estimating phone language model for the denominator graph" + mkdir -p exp/chain/e2e_base/log + $cmd exp/chain/e2e_base/log/make_phone_lm.log \ + cat data/train/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=1000 \ + ark:- exp/chain/e2e_base/phone_lm.fst +fi + +if [ $stage -le 5 ]; then + echo "$0: calling the flat-start chain recipe..." + local/chain/run_flatstart_cnn1a.sh +fi