From 52c6721da08493c0f6d5b07e3bb690a18a96365b Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 18 Mar 2018 22:21:35 -0400 Subject: [PATCH 1/9] Add initial scripts for e2e ocr - not cleaned --- egs/cifar/v1/image/get_allowed_lengths.py | 312 +++++++++++++++++++++ egs/cifar/v1/image/get_image2num_frames.py | 64 +++++ egs/iam/v1/run_end2end.sh | 76 +++++ 3 files changed, 452 insertions(+) create mode 100755 egs/cifar/v1/image/get_allowed_lengths.py create mode 100755 egs/cifar/v1/image/get_image2num_frames.py create mode 100755 egs/iam/v1/run_end2end.sh diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py new file mode 100755 index 00000000000..07db16e2238 --- /dev/null +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -0,0 +1,312 @@ +#!/usr/bin/env python + +# Copyright 2017 Hossein Hadian +# Apache 2.0 + + +""" This script perturbs speeds of utterances to force their lengths to some + allowed lengths spaced by a factor (like 10%) +""" + +import argparse +import os +import sys +import copy +import math +import logging + +sys.path.insert(0, 'steps') +import libs.common as common_lib + +logger = logging.getLogger('libs') +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - " + "%(funcName)s - %(levelname)s ] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + +def get_args(): + parser = argparse.ArgumentParser(description="""This script copies the 'srcdir' + data directory to output data directory 'dir' + while modifying the utterances so that there are + 3 copies of each utterance: one with the same + speed, one with a higher speed (not more than + factor% faster) and one with a lower speed + (not more than factor% slower)""") + parser.add_argument('factor', type=float, default=12, + help='Spacing (in percentage) between allowed lengths.') + parser.add_argument('srcdir', 
type=str, + help='path to source data dir') + parser.add_argument('--coverage-factor', type=float, default=0.05, + help="""Percentage of durations not covered from each + side of duration histogram.""") + parser.add_argument('--frame-subsampling-factor', type=int, default=3, + help="""Chain frame subsampling factor. + See steps/nnet3/chain/train.py""") + + args = parser.parse_args() + return args + +class Utterance: + """ This class represents a Kaldi utterance + in a data directory like data/train + """ + + def __init__(self, uid, wavefile, speaker, transcription, dur): + self.wavefile = (wavefile if wavefile.rstrip().endswith('|') else + 'cat {} |'.format(wavefile)) + self.speaker = speaker + self.transcription = transcription + self.id = uid + self.dur = float(dur) + + def to_kaldi_utt_str(self): + return self.id + " " + self.transcription + + def to_kaldi_wave_str(self): + return self.id + " " + self.wavefile + + def to_kaldi_dur_str(self): + return "{} {:0.3f}".format(self.id, self.dur) + + +def read_kaldi_datadir(dir): + """ Read a data directory like + data/train as a list of utterances + """ + + # check to make sure that no segments file exists as this script won't work + # with data directories which use a segments file. + if os.path.isfile(os.path.join(dir, 'segments')): + logger.info("The data directory '{}' seems to use a 'segments' file. " + "This script does not yet support a 'segments' file. You'll need " + "to use utils/data/extract_wav_segments_data_dir.sh " + "to convert the data dir so it does not use a 'segments' file. 
" + "Exiting...".format(dir)) + sys.exit(1) + + logger.info("Loading the data from {}...".format(dir)) + utterances = [] + wav_scp = read_kaldi_mapfile(os.path.join(dir, 'wav.scp')) + text = read_kaldi_mapfile(os.path.join(dir, 'text')) + utt2dur = read_kaldi_mapfile(os.path.join(dir, 'utt2dur')) + utt2spk = read_kaldi_mapfile(os.path.join(dir, 'utt2spk')) + + num_fail = 0 + for utt in wav_scp: + if utt in text and utt in utt2dur and utt in utt2spk: + utterances.append(Utterance(utt, wav_scp[utt], utt2spk[utt], + text[utt], utt2dur[utt])) + else: + num_fail += 1 + + if len(utterances) / len(wav_scp) < 0.5: + logger.info("More than half your data is problematic. Try " + "fixing using fix_data_dir.sh.") + sys.exit(1) + + logger.info("Successfully read {} utterances. Failed for {} " + "utterances.".format(len(utterances), num_fail)) + return utterances + + +def read_kaldi_mapfile(path): + """ Read any Kaldi mapping file - like text, .scp files, etc. + """ + + m = {} + with open(path, 'r') as f: + for line in f: + line = line.strip() + sp_pos = line.find(' ') + key = line[:sp_pos] + val = line[sp_pos+1:] + m[key] = val + return m + +def generate_kaldi_data_files(utterances, outdir): + """ Write out a list of utterances as Kaldi data files into an + output data directory. 
+ """ + + logger.info("Exporting to {}...".format(outdir)) + speakers = {} + + with open(os.path.join(outdir, 'text'), 'w') as f: + for utt in utterances: + f.write(utt.to_kaldi_utt_str() + "\n") + + with open(os.path.join(outdir, 'wav.scp'), 'w') as f: + for utt in utterances: + f.write(utt.to_kaldi_wave_str() + "\n") + + with open(os.path.join(outdir, 'utt2dur'), 'w') as f: + for utt in utterances: + f.write(utt.to_kaldi_dur_str() + "\n") + + with open(os.path.join(outdir, 'utt2spk'), 'w') as f: + for utt in utterances: + f.write(utt.id + " " + utt.speaker + "\n") + if utt.speaker not in speakers: + speakers[utt.speaker] = [utt.id] + else: + speakers[utt.speaker].append(utt.id) + + with open(os.path.join(outdir, 'spk2utt'), 'w') as f: + for s in speakers: + f.write(s + " ") + for utt in speakers[s]: + f.write(utt + " ") + f.write('\n') + + logger.info("Successfully wrote {} utterances to data " + "directory '{}'".format(len(utterances), outdir)) + +def find_duration_range(img2len, coverage_factor): + """Given a list of utterances, find the start and end duration to cover + + If we try to cover + all durations which occur in the training set, the number of + allowed lengths could become very large. + + Returns + ------- + start_dur: int + end_dur: int + """ + durs = [] + for im, imlen in img2len.items(): + durs.append(int(imlen)) + durs.sort() + to_ignore_dur = 0 + tot_dur = sum(durs) + for d in durs: + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > coverage_factor: + start_dur = d + break + to_ignore_dur = 0 + for d in reversed(durs): + to_ignore_dur += d + if to_ignore_dur * 100.0 / tot_dur > coverage_factor: + end_dur = d + break + if start_dur < 30: + start_dur = 30 # a hard limit to avoid too many allowed lengths --not critical + return start_dur, end_dur + + +def find_allowed_durations(start_len, end_len, args): + """Given the start and end duration, find a set of + allowed durations spaced by args.factor%. 
Also write + out the list of allowed durations and the corresponding + allowed lengths (in frames) on disk. + + Returns + ------- + allowed_durations: list of allowed durations (in seconds) + """ + + allowed_lengths = [] + length = start_len + with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'wb') as fp: + while length < end_len: + if length % args.frame_subsampling_factor != 0: + length = (args.frame_subsampling_factor * + (length // args.frame_subsampling_factor)) + allowed_lengths.append(length) + fp.write("{}\n".format(int(length))) + length *= args.factor + return allowed_lengths + + + +def perturb_utterances(utterances, allowed_durations, args): + """Given a set of utterances and a set of allowed durations, generate + an extended set of perturbed utterances (all having an allowed duration) + + Returns + ------- + perturbed_utterances: list of pertubed utterances + """ + + perturbed_utterances = [] + for u in utterances: + # find i such that: allowed_durations[i-1] <= u.dur <= allowed_durations[i] + # i = len(allowed_durations) --> no upper bound + # i = 0 --> no lower bound + if u.dur < allowed_durations[0]: + i = 0 + elif u.dur > allowed_durations[-1]: + i = len(allowed_durations) + else: + i = 1 + while i < len(allowed_durations): + if u.dur <= allowed_durations[i] and u.dur >= allowed_durations[i - 1]: + break + i += 1 + + if i > 0 and args.speed_perturb: # we have a smaller allowed duration + allowed_dur = allowed_durations[i - 1] + speed = u.dur / allowed_dur + if max(speed, 1.0/speed) > args.factor: # this could happen for very short/long utterances + continue + u1 = copy.deepcopy(u) + u1.id = 'pv1-' + u.id + u1.speaker = 'pv1-' + u.speaker + u1.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) + u1.dur = allowed_dur + perturbed_utterances.append(u1) + + + if i < len(allowed_durations): # we have a larger allowed duration + allowed_dur2 = allowed_durations[i] + speed = u.dur / allowed_dur2 + if max(speed, 1.0/speed) 
> args.factor: + continue + + ## Add two versions for the second allowed_duration + ## one version is by using speed modification using sox + ## the other is by extending by silence + if args.speed_perturb: + u2 = copy.deepcopy(u) + u2.id = 'pv2-' + u.id + u2.speaker = 'pv2-' + u.speaker + u2.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) + u2.dur = allowed_dur2 + perturbed_utterances.append(u2) + + delta = allowed_dur2 - u.dur + if delta <= 1e-4: + continue + u3 = copy.deepcopy(u) + u3.id = 'pv3-' + u.id + u3.speaker = 'pv3-' + u.speaker + u3.wavefile = '{} extend-wav-with-silence --extra-silence-length={} - - | '.format(u.wavefile, delta) + u3.dur = allowed_dur2 + perturbed_utterances.append(u3) + return perturbed_utterances + + + +def main(): + args = get_args() + args.factor = 1.0 + args.factor / 100.0 + + image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames.txt')) + + start_dur, end_dur = find_duration_range(image2length, args.coverage_factor) + logger.info("Lengths in the range [{},{}] will be covered. " + "Coverage rate: {}%".format(start_dur, end_dur, + 100.0 - args.coverage_factor * 2)) + logger.info("There will be {} unique allowed lengths " + "for the images.".format(int(math.log(end_dur / start_dur) / + math.log(args.factor)))) + + allowed_durations = find_allowed_durations(start_dur, end_dur, args) + + +if __name__ == '__main__': + main() diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py new file mode 100755 index 00000000000..eb4d0120658 --- /dev/null +++ b/egs/cifar/v1/image/get_image2num_frames.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". 
This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + + eg. local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('dir', type=str, + help='Source data directory (containing images.scp)') +parser.add_argument('--out-ark', type=str, default=None, + help='Where to write the output image-to-num_frames info.') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') +args = parser.parse_args() + + +def get_scaled_image_length(im): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale * sx) + return nx + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if not args.out_ark: + args.out_ark = os.path.join(args.dir,'image2num_frames.txt') +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark, 'w', encoding='latin-1') + +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_len = get_scaled_image_length(im) + (args.padding * 2) + print('{} {}'.format(image_id, im_len), file=out_fh) + +out_fh.close() diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh new file mode 100755 index 00000000000..2278ab99235 --- /dev/null +++ 
b/egs/iam/v1/run_end2end.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --username "$username" --password "$password" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + data/local/dict "" data/lang/temp data/lang +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." 
+ local/train_lm.sh + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang_test +fi + + +if [ $stage -le 4 ]; then + echo "$0: estimating phone language model for the denominator graph" + mkdir -p exp/chain/e2e_base/log + $train_cmd exp/chain/e2e_base/log/make_phone_lm.log \ + cat data/train/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=2000 \ + ark:- exp/chain/e2e_base/phone_lm.fst +fi + +if [ $stage -le 5 ]; then + echo "$0: calling the flat-start chain recipe..." + local/chain/e2e/run_tdnn_flatstart.sh +fi From 92a5866f4a108efc03402de15da3b87cbc2e6a95 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 18 Mar 2018 23:23:53 -0400 Subject: [PATCH 2/9] Add e2e chain script --- egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 153 ++++++++++++++++++ egs/iam/v1/run_end2end.sh | 4 +- 2 files changed, 155 insertions(+), 2 deletions(-) create mode 100755 egs/iam/v1/local/chain/run_flatstart_cnn1a.sh diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh new file mode 100755 index 00000000000..3dd77a03522 --- /dev/null +++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh @@ -0,0 +1,153 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=128,64/300=100,64,32/600=50,32,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1500000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train_e2e + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! 
cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + cp exp/chain/e2e_base/phone_lm.fst $treedir/ +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + opts="l2-regularize=0.075" + opts_2="l2-regularize=0.075" + opts_3="l2-regularize=0.1" + common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output 
include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 2278ab99235..1b606d0347c 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -62,11 +62,11 @@ fi if [ $stage -le 4 ]; then echo "$0: estimating phone language model for the denominator graph" mkdir -p exp/chain/e2e_base/log - $train_cmd exp/chain/e2e_base/log/make_phone_lm.log \ + $cmd exp/chain/e2e_base/log/make_phone_lm.log \ cat data/train/text \| \ steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ utils/sym2int.pl -f 2- data/lang/phones.txt \| \ - chain-est-phone-lm --num-extra-lm-states=2000 \ + chain-est-phone-lm --num-extra-lm-states=1000 \ ark:- exp/chain/e2e_base/phone_lm.fst fi From ea839ade679703a9fe52b1b636fcfe8a1c1ec1bb Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 19 Mar 2018 10:22:46 -0400 Subject: [PATCH 3/9] Some fixes --- egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 16 +++---- egs/iam/v1/local/make_features.py | 43 ++++++++++++++++--- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh index 3dd77a03522..d7c39c78f21 100755 --- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh @@ -8,19 +8,20 @@ set -e stage=0 train_stage=-10 get_egs_stage=-10 -affix=1a +affix=1afix1 # training options tdnn_dim=450 
num_epochs=4 num_jobs_initial=2 num_jobs_final=4 -minibatch_size=150=128,64/300=100,64,32/600=50,32,16/1200=16,8 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 common_egs_dir= l2_regularize=0.00005 -frames_per_iter=1500000 +frames_per_iter=1000000 cmvn_opts="--norm-means=true --norm-vars=true" train_set=train_e2e +lang_test=lang_test # End configuration section. echo "$0 $@" # Print the command line for logging @@ -108,6 +109,8 @@ if [ $stage -le 3 ]; then --egs.dir "$common_egs_dir" \ --egs.stage $get_egs_stage \ --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ --trainer.num-chunk-per-minibatch $minibatch_size \ --trainer.frames-per-iter $frames_per_iter \ --trainer.num-epochs $num_epochs \ @@ -140,12 +143,7 @@ fi if [ $stage -le 5 ]; then frames_per_chunk=$(echo $chunk_width | cut -d, -f1) steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ - --extra-left-context $chunk_left_context \ - --extra-right-context $chunk_right_context \ - --extra-left-context-initial 0 \ - --extra-right-context-final 0 \ - --frames-per-chunk $frames_per_chunk \ - --nj $nj --cmd "$cmd" \ + --nj 30 --cmd "$cmd" \ $dir/graph data/test $dir/decode_test || exit 1; fi diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py index b998464953f..08a6f96b940 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -30,6 +30,10 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') +#parser.add_argument('--img2len-file', type=str, default=None, +# help='If supplied, each images will be padded to reach the ' +# 'target length (this overrides --padding).') + args = parser.parse_args() @@ -49,7 +53,7 @@ def write_kaldi_matrix(file_handle, matrix, key): file_handle.write("\n") file_handle.write(" ]\n") -def get_scaled_image(im): +def 
get_scaled_image(im, allowed_lengths = None): scale_size = args.feat_dim sx = im.shape[1] sy = im.shape[0] @@ -57,11 +61,24 @@ def get_scaled_image(im): nx = int(scale_size) ny = int(scale * sx) im = misc.imresize(im, (nx, ny)) - padding_x = args.padding - padding_y = im.shape[0] - im_pad = np.concatenate((255 * np.ones((padding_y, padding_x), + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: + imlen = im.shape[1] + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + return None + padding = allowed_len - imlen + left_padding = padding // 2 + right_padding = padding - left_padding + dim_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((padding_y, padding_x), + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), dtype=int)), axis=1) return im_pad1 @@ -73,6 +90,15 @@ def get_scaled_image(im): else: out_fh = open(args.out_ark,'wb') +allowed_lengths = None +if os.path.isfile(os.path.join(args.dir,'allowed_lengths.txt')): + print("Found 'allowed-lengths' file...", file=sys.stderr) + allowed_lengths = [] + with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the features.".format(len(allowed_lengths)), file=sys.stderr) + with open(data_list_path) as f: for line in f: line = line.strip() @@ -80,8 +106,11 @@ def get_scaled_image(im): image_id = line_vect[0] image_path = line_vect[1] im = misc.imread(image_path) - im_scale = get_scaled_image(im) + im_scaled = get_scaled_image(im, allowed_lengths) - data = np.transpose(im_scale, (1, 0)) + if im_scaled is None: + print('Image scaling failed: {} '.format(image_id), file=sys.stderr) + continue + data = np.transpose(im_scaled, (1, 0)) data = np.divide(data, 255.0) write_kaldi_matrix(out_fh, 
data, image_id) From f5cbb24277a2c02c49260b6c4476e8e2945826ec Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 19 Mar 2018 14:52:47 -0400 Subject: [PATCH 4/9] Some cleaning --- egs/cifar/v1/image/get_allowed_lengths.py | 168 ---------------------- egs/iam/v1/run_end2end.sh | 4 +- 2 files changed, 2 insertions(+), 170 deletions(-) diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py index 07db16e2238..668d4cb493d 100755 --- a/egs/cifar/v1/image/get_allowed_lengths.py +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -49,68 +49,6 @@ def get_args(): args = parser.parse_args() return args -class Utterance: - """ This class represents a Kaldi utterance - in a data directory like data/train - """ - - def __init__(self, uid, wavefile, speaker, transcription, dur): - self.wavefile = (wavefile if wavefile.rstrip().endswith('|') else - 'cat {} |'.format(wavefile)) - self.speaker = speaker - self.transcription = transcription - self.id = uid - self.dur = float(dur) - - def to_kaldi_utt_str(self): - return self.id + " " + self.transcription - - def to_kaldi_wave_str(self): - return self.id + " " + self.wavefile - - def to_kaldi_dur_str(self): - return "{} {:0.3f}".format(self.id, self.dur) - - -def read_kaldi_datadir(dir): - """ Read a data directory like - data/train as a list of utterances - """ - - # check to make sure that no segments file exists as this script won't work - # with data directories which use a segments file. - if os.path.isfile(os.path.join(dir, 'segments')): - logger.info("The data directory '{}' seems to use a 'segments' file. " - "This script does not yet support a 'segments' file. You'll need " - "to use utils/data/extract_wav_segments_data_dir.sh " - "to convert the data dir so it does not use a 'segments' file. 
" - "Exiting...".format(dir)) - sys.exit(1) - - logger.info("Loading the data from {}...".format(dir)) - utterances = [] - wav_scp = read_kaldi_mapfile(os.path.join(dir, 'wav.scp')) - text = read_kaldi_mapfile(os.path.join(dir, 'text')) - utt2dur = read_kaldi_mapfile(os.path.join(dir, 'utt2dur')) - utt2spk = read_kaldi_mapfile(os.path.join(dir, 'utt2spk')) - - num_fail = 0 - for utt in wav_scp: - if utt in text and utt in utt2dur and utt in utt2spk: - utterances.append(Utterance(utt, wav_scp[utt], utt2spk[utt], - text[utt], utt2dur[utt])) - else: - num_fail += 1 - - if len(utterances) / len(wav_scp) < 0.5: - logger.info("More than half your data is problematic. Try " - "fixing using fix_data_dir.sh.") - sys.exit(1) - - logger.info("Successfully read {} utterances. Failed for {} " - "utterances.".format(len(utterances), num_fail)) - return utterances - def read_kaldi_mapfile(path): """ Read any Kaldi mapping file - like text, .scp files, etc. @@ -126,44 +64,6 @@ def read_kaldi_mapfile(path): m[key] = val return m -def generate_kaldi_data_files(utterances, outdir): - """ Write out a list of utterances as Kaldi data files into an - output data directory. 
- """ - - logger.info("Exporting to {}...".format(outdir)) - speakers = {} - - with open(os.path.join(outdir, 'text'), 'w') as f: - for utt in utterances: - f.write(utt.to_kaldi_utt_str() + "\n") - - with open(os.path.join(outdir, 'wav.scp'), 'w') as f: - for utt in utterances: - f.write(utt.to_kaldi_wave_str() + "\n") - - with open(os.path.join(outdir, 'utt2dur'), 'w') as f: - for utt in utterances: - f.write(utt.to_kaldi_dur_str() + "\n") - - with open(os.path.join(outdir, 'utt2spk'), 'w') as f: - for utt in utterances: - f.write(utt.id + " " + utt.speaker + "\n") - if utt.speaker not in speakers: - speakers[utt.speaker] = [utt.id] - else: - speakers[utt.speaker].append(utt.id) - - with open(os.path.join(outdir, 'spk2utt'), 'w') as f: - for s in speakers: - f.write(s + " ") - for utt in speakers[s]: - f.write(utt + " ") - f.write('\n') - - logger.info("Successfully wrote {} utterances to data " - "directory '{}'".format(len(utterances), outdir)) - def find_duration_range(img2len, coverage_factor): """Given a list of utterances, find the start and end duration to cover @@ -223,74 +123,6 @@ def find_allowed_durations(start_len, end_len, args): -def perturb_utterances(utterances, allowed_durations, args): - """Given a set of utterances and a set of allowed durations, generate - an extended set of perturbed utterances (all having an allowed duration) - - Returns - ------- - perturbed_utterances: list of pertubed utterances - """ - - perturbed_utterances = [] - for u in utterances: - # find i such that: allowed_durations[i-1] <= u.dur <= allowed_durations[i] - # i = len(allowed_durations) --> no upper bound - # i = 0 --> no lower bound - if u.dur < allowed_durations[0]: - i = 0 - elif u.dur > allowed_durations[-1]: - i = len(allowed_durations) - else: - i = 1 - while i < len(allowed_durations): - if u.dur <= allowed_durations[i] and u.dur >= allowed_durations[i - 1]: - break - i += 1 - - if i > 0 and args.speed_perturb: # we have a smaller allowed duration - 
allowed_dur = allowed_durations[i - 1] - speed = u.dur / allowed_dur - if max(speed, 1.0/speed) > args.factor: # this could happen for very short/long utterances - continue - u1 = copy.deepcopy(u) - u1.id = 'pv1-' + u.id - u1.speaker = 'pv1-' + u.speaker - u1.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) - u1.dur = allowed_dur - perturbed_utterances.append(u1) - - - if i < len(allowed_durations): # we have a larger allowed duration - allowed_dur2 = allowed_durations[i] - speed = u.dur / allowed_dur2 - if max(speed, 1.0/speed) > args.factor: - continue - - ## Add two versions for the second allowed_duration - ## one version is by using speed modification using sox - ## the other is by extending by silence - if args.speed_perturb: - u2 = copy.deepcopy(u) - u2.id = 'pv2-' + u.id - u2.speaker = 'pv2-' + u.speaker - u2.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed) - u2.dur = allowed_dur2 - perturbed_utterances.append(u2) - - delta = allowed_dur2 - u.dur - if delta <= 1e-4: - continue - u3 = copy.deepcopy(u) - u3.id = 'pv3-' + u.id - u3.speaker = 'pv3-' + u.speaker - u3.wavefile = '{} extend-wav-with-silence --extra-silence-length={} - - | '.format(u.wavefile, delta) - u3.dur = allowed_dur2 - perturbed_utterances.append(u3) - return perturbed_utterances - - - def main(): args = get_args() args.factor = 1.0 + args.factor / 100.0 diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 1b606d0347c..d479bfa2a73 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -47,7 +47,7 @@ fi if [ $stage -le 2 ]; then echo "$0: Preparing dictionary and lang..." local/prepare_dict.sh - utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \ + utils/prepare_lang.sh --sil-prob 0.95 \ data/local/dict "" data/lang/temp data/lang fi @@ -72,5 +72,5 @@ fi if [ $stage -le 5 ]; then echo "$0: calling the flat-start chain recipe..." 
- local/chain/e2e/run_tdnn_flatstart.sh + local/chain/run_flatstart_cnn1a.sh fi From 871625b3f731be47bf920658bbe412137b9ef391 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 26 Mar 2018 15:12:34 -0400 Subject: [PATCH 5/9] Fix the docs. --- egs/cifar/v1/image/get_allowed_lengths.py | 17 ++++++++--------- egs/cifar/v1/image/get_image2num_frames.py | 22 ++++++++++------------ 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py index 668d4cb493d..2e7996c4d77 100755 --- a/egs/cifar/v1/image/get_allowed_lengths.py +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -4,8 +4,11 @@ # Apache 2.0 -""" This script perturbs speeds of utterances to force their lengths to some - allowed lengths spaced by a factor (like 10%) +""" This script finds a set of allowed lengths for a given OCR/HWR data dir. + The allowed lengths are spaced by a factor (like 10%) and are written + in an output file named "allowed_lengths.txt" in the output data dir. This + file is later used by make_features.py to pad each image sufficiently so that + they all have an allowed length. This is intended for end2end chain training. """ import argparse @@ -28,13 +31,9 @@ logger.addHandler(handler) def get_args(): - parser = argparse.ArgumentParser(description="""This script copies the 'srcdir' - data directory to output data directory 'dir' - while modifying the utterances so that there are - 3 copies of each utterance: one with the same - speed, one with a higher speed (not more than - factor% faster) and one with a lower speed - (not more than factor% slower)""") + parser = argparse.ArgumentParser(description="""This script finds a set of + allowed lengths for a given OCR/HWR data dir. 
+ Intended for chain training.""" parser.add_argument('factor', type=float, default=12, help='Spacing (in percentage) between allowed lengths.') parser.add_argument('srcdir', type=str, diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py index eb4d0120658..91a7e34dae5 100755 --- a/egs/cifar/v1/image/get_image2num_frames.py +++ b/egs/cifar/v1/image/get_image2num_frames.py @@ -1,16 +1,13 @@ #!/usr/bin/env python3 -# Copyright 2017 Chun Chieh Chang -# 2017 Ashish Arora +# Copyright 2018 Hossein Hadian -""" This script converts images to Kaldi-format feature matrices. The input to - this script is the path to a data directory, e.g. "data/train". This script - reads the images listed in images.scp and writes them to standard output - (by default) as Kaldi-formatted matrices (in text form). It also scales the - images so they have the same height (via --feat-dim). It can optionally pad - the images (on left/right sides) with white pixels. - eg. local/make_features.py data/train --feat-dim 40 +""" This script computes the image lengths (with padding) in an image data dir. + The output is written to 'image2num_frames.txt' in the given data dir. This + file is later used by image/get_allowed_lengths.py to find a set of allowed lengths + for the data dir. 
The output format is similar to utt2num_frames + """ import argparse @@ -19,12 +16,13 @@ import numpy as np from scipy import misc -parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and - writes them to standard output in text format.""") +parser = argparse.ArgumentParser(description="""Computes the image lengths (with padding) in an image data dir + and writes them (by default) to image2num_frames.txt.""") parser.add_argument('dir', type=str, help='Source data directory (containing images.scp)') parser.add_argument('--out-ark', type=str, default=None, - help='Where to write the output image-to-num_frames info.') + help='Where to write the output image-to-num_frames info. ' + 'Default: dir/image2num_frames.txt') parser.add_argument('--feat-dim', type=int, default=40, help='Size to scale the height of all images') parser.add_argument('--padding', type=int, default=5, From 95958ce8ab53222df0c58441be3770a3775b5cca Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 26 Mar 2018 15:16:22 -0400 Subject: [PATCH 6/9] Add more docs --- egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh index d7c39c78f21..1f2d1755fe2 100755 --- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh @@ -1,6 +1,7 @@ #!/bin/bash # Copyright 2017 Hossein Hadian +# This script does end2end chain training (i.e. 
from scratch) set -e @@ -8,7 +9,7 @@ set -e stage=0 train_stage=-10 get_egs_stage=-10 -affix=1afix1 +affix=1a # training options tdnn_dim=450 From f781feed8f087e03efc99dad40f4936b23b4f83e Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Mon, 26 Mar 2018 19:05:59 -0400 Subject: [PATCH 7/9] Some more cleaning + results --- egs/cifar/v1/image/get_image2num_frames.py | 2 +- egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 13 +++++++++ egs/iam/v1/local/make_features.py | 29 +++++++++++++------ 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py index 91a7e34dae5..5ebd5d15a9f 100755 --- a/egs/cifar/v1/image/get_image2num_frames.py +++ b/egs/cifar/v1/image/get_image2num_frames.py @@ -16,7 +16,7 @@ import numpy as np from scipy import misc -parser = argparse.ArgumentParser(description="""Computes the image lengths (with padding) in an image data dir +parser = argparse.ArgumentParser(description="""Computes the image lengths (i.e. width) in an image data dir and writes them (by default) to image2num_frames.txt.""") parser.add_argument('dir', type=str, help='Source data directory (containing images.scp)') diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh index 1f2d1755fe2..65eeedcc75b 100755 --- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh +++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh @@ -3,6 +3,19 @@ # This script does end2end chain training (i.e. 
from scratch) +# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a +# System cnn_1a cnn_chainali_1c e2e_cnn_1a +# WER 18.58 12.84 15.46 +# CER 10.17 6.40 7.21 +# Final train prob -0.0122 -0.0120 -0.0426 +# Final valid prob -0.0999 -0.0199 -0.0724 +# Final train prob (xent) -0.5652 -0.9973 +# Final valid prob (xent) -0.9758 -1.1537 +# Parameters 4.36M 3.96M 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/ +# exp/chain/e2e_cnn_1a/: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.040->-0.040 (over 1) logprob:train/valid[13,20,final]=(-0.065,-0.046,-0.043/-0.081,-0.073,-0.072) + set -e # configs for 'chain' diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py index 08a6f96b940..8cfca5ee830 100755 --- a/egs/iam/v1/local/make_features.py +++ b/egs/iam/v1/local/make_features.py @@ -2,6 +2,7 @@ # Copyright 2017 Chun Chieh Chang # 2017 Ashish Arora +# 2018 Hossein Hadian """ This script converts images to Kaldi-format feature matrices. The input to this script is the path to a data directory, e.g. "data/train". This script @@ -9,6 +10,10 @@ (by default) as Kaldi-formatted matrices (in text form). It also scales the images so they have the same height (via --feat-dim). It can optionally pad the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. eg. 
local/make_features.py data/train --feat-dim 40 """ @@ -30,9 +35,7 @@ parser.add_argument('--padding', type=int, default=5, help='Number of white pixels to pad on the left' 'and right side of the image.') -#parser.add_argument('--img2len-file', type=str, default=None, -# help='If supplied, each images will be padded to reach the ' -# 'target length (this overrides --padding).') + args = parser.parse_args() @@ -63,7 +66,7 @@ def get_scaled_image(im, allowed_lengths = None): im = misc.imresize(im, (nx, ny)) if allowed_lengths is None: left_padding = right_padding = args.padding - else: + else: # Find an allowed length for the image imlen = im.shape[1] allowed_len = 0 for l in allowed_lengths: @@ -71,6 +74,7 @@ def get_scaled_image(im, allowed_lengths = None): allowed_len = l break if allowed_len == 0: + # No allowed length was found for the image (the image is too long) return None padding = allowed_len - imlen left_padding = padding // 2 @@ -83,7 +87,7 @@ def get_scaled_image(im, allowed_lengths = None): return im_pad1 ### main ### -data_list_path = os.path.join(args.dir,'images.scp') +data_list_path = os.path.join(args.dir, 'images.scp') if args.out_ark == '-': out_fh = sys.stdout @@ -91,14 +95,17 @@ def get_scaled_image(im, allowed_lengths = None): out_fh = open(args.out_ark,'wb') allowed_lengths = None -if os.path.isfile(os.path.join(args.dir,'allowed_lengths.txt')): - print("Found 'allowed-lengths' file...", file=sys.stderr) +if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) allowed_lengths = [] with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: for line in f: allowed_lengths.append(int(line.strip())) - print("Read {} allowed lengths and will apply them to the features.".format(len(allowed_lengths)), file=sys.stderr) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) +num_fail = 0 +num_ok = 0 with 
open(data_list_path) as f: for line in f: line = line.strip() @@ -109,8 +116,12 @@ def get_scaled_image(im, allowed_lengths = None): im_scaled = get_scaled_image(im, allowed_lengths) if im_scaled is None: - print('Image scaling failed: {} '.format(image_id), file=sys.stderr) + num_fail += 1 continue data = np.transpose(im_scaled, (1, 0)) data = np.divide(data, 255.0) + num_ok += 1 write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (iamge too ' + 'long).'.format(num_ok, num_fail)) From 861ecbaa1abf38cfe3ce2e72cc06f9b9f6df1f63 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 27 Mar 2018 16:18:23 -0400 Subject: [PATCH 8/9] Add new chain recipe with l2-regularize (by @aarora8) --- egs/iam/v1/local/chain/run_cnn_chainali_1c.sh | 246 ++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1c.sh diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh new file mode 100755 index 00000000000..6ff76490303 --- /dev/null +++ b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh @@ -0,0 +1,246 @@ +#!/bin/bash + +# chainali_1c is as chainali_1b except it uses l2-regularize +# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c +# System cnn_chainali_1b cnn_chainali_1c +# WER 14.67 12.84 +# CER 7.31 6.40 +# Final train prob 0.0042 -0.0120 +# Final valid prob -0.0256 -0.0199 +# Final train prob (xent) -0.6282 -0.9973 +# Final valid prob (xent) -0.9096 -1.1537 +# Parameters 3.96M 3.96M + +# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c +# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that 
we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + opts="l2-regularize=0.075" + opts_2="l2-regularize=0.075" + opts_3="l2-regularize=0.1" + common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain 
dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $opts_2 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $opts_3 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). 
So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi From 050bc1c25cc5e9b997b9440dfb157244412103db Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Tue, 27 Mar 2018 16:44:27 -0400 Subject: [PATCH 9/9] Update to python3 + rename image2num_frames.txt to image2num_frames (to be consistent with utt2num_frames) --- egs/cifar/v1/image/get_allowed_lengths.py | 10 +++++----- egs/cifar/v1/image/get_image2num_frames.py | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py index 2e7996c4d77..02321fdd2df 100755 --- a/egs/cifar/v1/image/get_allowed_lengths.py +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2017 Hossein Hadian # Apache 2.0 @@ -33,7 +33,7 @@ def get_args(): parser = argparse.ArgumentParser(description="""This script finds a set of allowed lengths for a given OCR/HWR data dir. 
- Intended for chain training.""" + Intended for chain training.""") parser.add_argument('factor', type=float, default=12, help='Spacing (in percentage) between allowed lengths.') parser.add_argument('srcdir', type=str, @@ -54,7 +54,7 @@ def read_kaldi_mapfile(path): """ m = {} - with open(path, 'r') as f: + with open(path, 'r', encoding='latin-1') as f: for line in f: line = line.strip() sp_pos = line.find(' ') @@ -110,7 +110,7 @@ def find_allowed_durations(start_len, end_len, args): allowed_lengths = [] length = start_len - with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'wb') as fp: + with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as fp: while length < end_len: if length % args.frame_subsampling_factor != 0: length = (args.frame_subsampling_factor * @@ -126,7 +126,7 @@ def main(): args = get_args() args.factor = 1.0 + args.factor / 100.0 - image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames.txt')) + image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames')) start_dur, end_dur = find_duration_range(image2length, args.coverage_factor) logger.info("Lengths in the range [{},{}] will be covered. " diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py index 5ebd5d15a9f..3c003bb9947 100755 --- a/egs/cifar/v1/image/get_image2num_frames.py +++ b/egs/cifar/v1/image/get_image2num_frames.py @@ -4,7 +4,7 @@ """ This script computes the image lengths (with padding) in an image data dir. - The output is written to 'image2num_frames.txt' in the given data dir. This + The output is written to 'image2num_frames' in the given data dir. This file is later used by image/get_allowed_lengths.py to find a set of allowed lengths for the data dir. The output format is similar to utt2num_frames @@ -17,12 +17,12 @@ from scipy import misc parser = argparse.ArgumentParser(description="""Computes the image lengths (i.e. 
width) in an image data dir - and writes them (by default) to image2num_frames.txt.""") + and writes them (by default) to image2num_frames.""") parser.add_argument('dir', type=str, help='Source data directory (containing images.scp)') parser.add_argument('--out-ark', type=str, default=None, help='Where to write the output image-to-num_frames info. ' - 'Default: dir/image2num_frames.txt') + 'Default: "dir"/image2num_frames') parser.add_argument('--feat-dim', type=int, default=40, help='Size to scale the height of all images') parser.add_argument('--padding', type=int, default=5, @@ -43,7 +43,7 @@ def get_scaled_image_length(im): data_list_path = os.path.join(args.dir,'images.scp') if not args.out_ark: - args.out_ark = os.path.join(args.dir,'image2num_frames.txt') + args.out_ark = os.path.join(args.dir,'image2num_frames') if args.out_ark == '-': out_fh = sys.stdout else: