diff --git a/egs/iam/s5/cmd.sh b/egs/iam/s5/cmd.sh
new file mode 100644
index 00000000000..466a51def1b
--- /dev/null
+++ b/egs/iam/s5/cmd.sh
@@ -0,0 +1,15 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine). queue.pl works with GridEngine (qsub). slurm.pl works
+# with slurm. Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration. Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export cmd="queue.pl"
+export train_cmd="queue.pl"
+export decode_cmd="queue.pl --mem 4G"
diff --git a/egs/iam/s5/image b/egs/iam/s5/image
new file mode 120000
index 00000000000..1668ee99922
--- /dev/null
+++ b/egs/iam/s5/image
@@ -0,0 +1 @@
+../../cifar/v1/image/
\ No newline at end of file
diff --git a/egs/iam/s5/local/augment_and_make_feature_vect.py b/egs/iam/s5/local/augment_and_make_feature_vect.py
new file mode 100755
index 00000000000..b1c179d71ed
--- /dev/null
+++ b/egs/iam/s5/local/augment_and_make_feature_vect.py
@@ -0,0 +1,288 @@
+#!/usr/bin/env python
+import random
+import argparse
+import os
+import sys
+import scipy.io as sio
+import numpy as np
+from scipy import misc
+from scipy.ndimage.interpolation import affine_transform
+import math
+from signal import signal, SIGPIPE, SIG_DFL
+signal(SIGPIPE, SIG_DFL)
+
+parser = argparse.ArgumentParser(
+    description="""Generates and saves the feature vectors""")
+parser.add_argument(
+    'dir', type=str, help='directory of images.scp and is also output directory')
+parser.add_argument('--seg', type=str, default='1',
+                    help='JOB number of images.JOB.scp if run in parallel mode')
+parser.add_argument('--out-ark', type=str, default='-',
+                    help='where to write the output feature file')
+parser.add_argument('--scale-size', type=int, default=40,
+                    help='size to scale the height of all images')
+parser.add_argument('--padding', type=int, default=5,
+                    help='horizontal white padding to add on each side, as a percentage of image width (minimum 5 pixels)')
+parser.add_argument('--vertical-shift', type=int, default=10,
+                    help='total number of padding pixels to add vertically (split between top and bottom)')
+args = parser.parse_args()
+
+
+def write_kaldi_matrix(file_handle, matrix, key):
+    file_handle.write(key + " [ ")
+    num_rows = len(matrix)
+    if num_rows == 0:
+        raise Exception("Matrix is empty")
+    num_cols = len(matrix[0])
+
+    for row_index in range(len(matrix)):
+        if num_cols != len(matrix[row_index]):
+            raise Exception("All the rows of a matrix are expected to "
+                            "have the same length")
+        file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index])))
+        if row_index != num_rows - 1:
+            file_handle.write("\n")
+    file_handle.write(" ]\n")
+
+
+def get_scaled_image(im):
+    scale_size = args.scale_size
+    sx = im.shape[1]  # width
+    sy = im.shape[0]  # height
+    scale = (1.0 * scale_size) / sy
+    nx = int(scale_size)
+    ny = int(scale * sx)
+    im = misc.imresize(im, (nx, ny))
+    padding_x = max(5, int((args.padding / 100) * im.shape[1]))
+    padding_y = im.shape[0]
+    im_pad = np.concatenate(
+        (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1)
+    im_pad1 = np.concatenate(
+        (im_pad, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1)
+    return im_pad1
+
+
+def contrast_normalization(im, low_pct, high_pct):
+    element_number = im.size
+    rows = im.shape[0]
+    cols = im.shape[1]
+    im_contrast = np.zeros(shape=im.shape)
+    low_index = int(low_pct * element_number)
+    high_index = int(high_pct * element_number)
+    sorted_im = np.sort(im, axis=None)
+    low_thred = sorted_im[low_index]
+    high_thred = sorted_im[high_index]
+    for i in range(rows):
+        for j in range(cols):
+            if im[i, j] > high_thred:
+                im_contrast[i, j] = 255  # lightest to white
+            elif im[i, j] < low_thred:
+                im_contrast[i, j] = 0  # darkest to black
+            else:
+                # linear normalization
+                im_contrast[i, j] = (im[i, j] - low_thred) * \
+                    255 / (high_thred - low_thred)
+    return im_contrast
+
+
+def geometric_moment(frame, p, q):
+    m = 0
+    for i in range(frame.shape[1]):
+        for j in range(frame.shape[0]):
+            m += (i ** p) * (j ** q) * frame[i][j]
+    return m
+
+
+def central_moment(frame, p, q):
+    u = 0
+    x_bar = geometric_moment(frame, 1, 0) / \
+        geometric_moment(frame, 0, 0)  # m10/m00
+    y_bar = geometric_moment(frame, 0, 1) / \
+        geometric_moment(frame, 0, 0)  # m01/m00
+    for i in range(frame.shape[1]):
+        for j in range(frame.shape[0]):
+            u += ((i - x_bar)**p) * ((j - y_bar)**q) * frame[i][j]
+    return u
+
+
+def height_normalization(frame, w, h):
+    frame_normalized = np.zeros(shape=(h, w))
+    alpha = 4
+    x_bar = geometric_moment(frame, 1, 0) / \
+        geometric_moment(frame, 0, 0)  # m10/m00
+    y_bar = geometric_moment(frame, 0, 1) / \
+        geometric_moment(frame, 0, 0)  # m01/m00
+    sigma_x = (alpha * ((central_moment(frame, 2, 0) /
+                         geometric_moment(frame, 0, 0)) ** .5))  # alpha * sqrt(u20/m00)
+    sigma_y = (alpha * ((central_moment(frame, 0, 2) /
+                         geometric_moment(frame, 0, 0)) ** .5))  # alpha * sqrt(u02/m00)
+    for x in range(w):
+        for y in range(h):
+            i = int((x / w - 0.5) * sigma_x + x_bar)
+            j = int((y / h - 0.5) * sigma_y + y_bar)
+            frame_normalized[x][y] = frame[i][j]
+    return frame_normalized
+
+
+def find_slant(im):
+    rows = im.shape[0]
+    cols = im.shape[1]
+    sum_max = 0
+    slant_degree = 0
+    for shear_degree in range(-45, 45, 5):
+        sum = 0
+        shear_rad = shear_degree / 360.0 * 2 * math.pi
+        shear_matrix = np.array([[1, 0],
+                                 [np.tan(shear_rad), 1]])
+        sheared_im = affine_transform(im, shear_matrix, cval=255.0)
+        for j in range(cols):
+            foreground = (sheared_im[:, j] < 100)
+            number = np.sum(foreground)
+            # print(number)
+            if number != 0:
+                start_point = -1
+                end_point = -1
+                start_point = 0
+                for i in range(rows):
+                    if foreground[i] == 1:
+                        start_point = i
+                        break
+                for i in range(rows - 1, -1, -1):
+                    if foreground[i] == 1:
+                        end_point = i
+                        break
+                length = end_point - start_point + 1
+                #print(number, length)
+                if length == number:
+                    sum = sum + number * number
+        #print(shear_degree, sum)
+        if sum > sum_max:
+            sum_max = sum
+            slant_degree = shear_degree
+    return slant_degree
+
+
+def deslant(im, shear):
+    padding_x = int(abs(np.tan(shear)) * im.shape[0])
+    padding_y = im.shape[0]
+    if shear > 0:
+        im_pad = np.concatenate(
+            (255 * np.ones((padding_y, padding_x), dtype=int), im), axis=1)
+    else:
+        im_pad = np.concatenate(
+            (im, 255 * np.ones((padding_y, padding_x), dtype=int)), axis=1)
+
+    shear_matrix = np.array([[1, 0],
+                             [np.tan(shear), 1]])
+    # sheared_im = affine_transform(image, shear_matrix, output_shape=(
+    #     im.shape[0], im.shape[1] + abs(int(im.shape[0] * np.tan(shear)))), cval=128.0)
+    sheared_im = affine_transform(im_pad, shear_matrix, cval=255.0)
+    return sheared_im
+
+
+def vertical_shift(im, mode='mid'):
+    total
= args.vertical_shift + if mode == 'mid': + top = total / 2 + bottom = total - top + elif mode == 'top': # more padding on top + top = random.randint(total / 2, total) + bottom = total - top + elif mode == 'bottom': # more padding on bottom + top = random.randint(0, total / 2) + bottom = total - top + width = im.shape[1] + im_pad = np.concatenate( + (255 * np.ones((top, width), dtype=int), im), axis=0) + im_pad = np.concatenate( + (im_pad, 255 * np.ones((bottom, width), dtype=int)), axis=0) + return im_pad + + +def image_augment(im, out_fh, image_id): + random.seed(1) + shift_setting = ['mid', 'top', 'bottom'] + image_shift_id = [] + for i in range(3): + image_shift_id.append(image_id + '_shift' + str(i + 1)) + im_shift = vertical_shift(im, shift_setting[i]) + im_scaled = get_scaled_image(im_shift) + data = np.transpose(im_scaled, (1, 0)) + data = np.divide(data, 255.0) + new_scp_list.append(image_id + '_shift' + str(i + 1)) + write_kaldi_matrix(out_fh, data, image_shift_id[i]) + + +# main # +new_scp_list = list() +text_file = os.path.join(args.dir, 'backup', 'text') +text_dict = dict() # stores imageID and text + +with open(text_file) as text_fh: + for uttID_text in text_fh: + uttID_text = uttID_text.strip() + uttID_text_vect = uttID_text.split(" ") + uttID = uttID_text_vect[0] + imageID = uttID.split("_")[1] + text_vect = uttID_text_vect[1:] + text = " ".join(text_vect) + text_dict[imageID] = text + +utt2spk_file = os.path.join(args.dir, 'backup', 'utt2spk') +uttID_spk_dict = dict() # stores imageID and speaker + +with open(utt2spk_file) as utt2spk_fh: + for uttID_spk in utt2spk_fh: + uttID_spk = uttID_spk.strip() + uttID_spk_vect = uttID_spk.split(" ") + uttID = uttID_spk_vect[0] + imageID = uttID.split("_")[1] + spk = uttID_spk_vect[1] + uttID_spk_dict[imageID] = spk + +image_file = os.path.join(args.dir, 'backup', 'images.scp') +uttID_path_dict = dict() # stores imageID and image path + +with open(image_file) as image_fh: + for uttID_path in image_fh: + uttID_path = uttID_path.strip() + uttID_path_vect = uttID_path.split(" ") + uttID = uttID_path_vect[0] + imageID = uttID.split("_")[1] + path = uttID_path_vect[1] + uttID_path_dict[imageID] = path + +scp_name = 'images.scp' +data_list_path = os.path.join(args.dir, 'backup', scp_name) + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark, 'wb') + +text_file = os.path.join(args.dir, 'text') +text_fh = open(text_file, 'w+') + +utt2spk_file = os.path.join(args.dir, 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w+') + +image_file = os.path.join(args.dir, 'images.scp') +image_fh = open(image_file, 'w+') + +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + #im_contrast = contrast_normalization(im, 0.05, 0.2) + #shear = (find_slant(im_contrast) / 360.0) * 2 * math.pi + im_scaled = get_scaled_image(im) + image_augment(im_scaled, out_fh, image_id) + +for uttID in new_scp_list: + imageID = uttID.split("_")[1] + text_fh.write(uttID + ' ' + text_dict[imageID] + '\n') + utt2spk_fh.write(uttID + ' ' + uttID_spk_dict[imageID] + '\n') + image_fh.write(uttID + ' ' + uttID_path_dict[imageID] + '\n') diff --git a/egs/iam/s5/local/chain/align_nnet3_lats.sh b/egs/iam/s5/local/chain/align_nnet3_lats.sh new file mode 100755 index 00000000000..cd81000ac37 --- /dev/null +++ b/egs/iam/s5/local/chain/align_nnet3_lats.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# +# Copyright 2012-2015 Johns Hopkins 
University (Author: Daniel Povey) +# 2017 Hossein Hadian +# Apache 2.0 + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=1.0" +acoustic_scale=1.0 +beam=10 +final_beam=20 # For the lattice-generation phase there is no retry-beam. This + # is a limitation of gmm-latgen-faster. We just use an + # intermediate beam. We'll lose a little data and it will be + # slightly slower. (however, the min-active of 200 that + # gmm-latgen-faster defaults to may help.) +post_decode_acwt=12.0 # can be used in 'chain' systems to scale acoustics by 10 so the + # regular scoring script works. +frames_per_chunk=50 +extra_left_context=0 +extra_right_context=0 +extra_left_context_initial=-1 +extra_right_context_final=-1 + +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_nnet3_lats.sh " + echo "e.g.: steps/align_nnet3_lats.sh data/train data/lang exp/tri1 exp/tri1_lats" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +sdata=$data/split$nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. + +feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" + +tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; +frame_subsampling_opt= +if [ -f $srcdir/frame_subsampling_factor ]; then + # e.g. for 'chain' systems + frame_subsampling_factor=$(cat $srcdir/frame_subsampling_factor) + frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" + cp $srcdir/frame_subsampling_factor $dir +fi + +model=$srcdir/final.mdl +if [ "$post_decode_acwt" == 1.0 ]; then + lat_wspecifier="ark:|gzip -c >$dir/lat.JOB.gz" +else + lat_wspecifier="ark:|lattice-scale --acoustic-scale=$post_decode_acwt ark:- ark:- | gzip -c >$dir/lat.JOB.gz" +fi + +if [ $stage -le 1 ]; then + echo "$0: generating lattices containing alternate pronunciations." 
+ $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int \ + $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" ark:- \| \ + nnet3-latgen-faster $frame_subsampling_opt \ + --frames-per-chunk=$frames_per_chunk \ + --extra-left-context=$extra_left_context \ + --extra-right-context=$extra_right_context \ + --extra-left-context-initial=$extra_left_context_initial \ + --extra-right-context-final=$extra_right_context_final \ + --minimize=false --acoustic-scale=$acoustic_scale --beam=$final_beam \ + --lattice-beam=$final_beam --allow-partial=false \ + --word-determinize=false --word-symbol-table=$lang/words.txt \ + $model ark:- "$feats" \ + "$lat_wspecifier" || exit 1; +fi + +echo "$0: done generating lattices from training transcripts." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/iam/s5/local/chain/compare_wer.sh b/egs/iam/s5/local/chain/compare_wer.sh new file mode 100755 index 00000000000..8cd85ecdf01 --- /dev/null +++ b/egs/iam/s5/local/chain/compare_wer.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo diff --git a/egs/iam/s5/local/chain/run_cnn_1a.sh b/egs/iam/s5/local/chain/run_cnn_1a.sh new file mode 100755 index 00000000000..02df03d1cd6 --- /dev/null +++ b/egs/iam/s5/local/chain/run_cnn_1a.sh @@ -0,0 +1,231 @@ +#!/bin/bash + +# steps/info/chain_dir_info.pl exp/chain/cnn_1a/ +# exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->380 combine=-0.033->-0.025 xent:train/valid[13,20,final]=(-1.07,-1.31,-0.560/-1.30,-1.70,-0.978) logprob:train/valid[13,20,final]=(-0.064,-0.119,-0.011/-0.115,-0.208,-0.096) + +# head exp/chain/cnn_1a/decode_test/scoring_kaldi/best_wer +#%WER 18.34 [ 3231 / 17616, 348 ins, 693 del, 2190 sub ] exp/chain/cnn_1a/decode_test/wer_8_1.0 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. 
+ali=tri3_ali +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=32 +chunk_right_context=32 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/$lang_test $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. 
we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$frame_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
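
As an illustration only (not part of the patch): the height geometry of the conv-relu-batchnorm stack defined in the xconfig above can be sanity-checked in a few lines of Python. This is a sketch assuming the usual relation height-out = height-in / height-subsample-out for these layer sizes (40 -> 40 -> 20 -> 20 -> 10); it is not code used by the recipe.

# Illustrative check of the conv-layer heights used in the xconfig above.
layers = [
    # (name, height_in, height_subsample_out, height_out)
    ("cnn1", 40, 1, 40),
    ("cnn2", 40, 2, 20),
    ("cnn3", 20, 1, 20),
    ("cnn4", 20, 2, 10),
]
height = 40  # input dim=40: line images are scaled to 40 pixels high
for name, h_in, subsample, h_out in layers:
    assert height == h_in, "layer %s expects height-in=%d" % (name, h_in)
    height = h_in // subsample
    assert height == h_out
    print("%s: height %d -> %d" % (name, h_in, height))
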
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 32 \ + --extra-right-context-final 32 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/s5/local/chain/run_cnn_chainali_1a.sh b/egs/iam/s5/local/chain/run_cnn_chainali_1a.sh new file mode 100755 index 00000000000..0f0296d7af6 --- /dev/null +++ b/egs/iam/s5/local/chain/run_cnn_chainali_1a.sh @@ -0,0 +1,236 @@ +#!/bin/bash + +# chainali_1a uses chain model for lattice instead of gmm-hmm model. It has more cnn layers as compared to 1a +# (18.34% -> 13.68%) + +# steps/info/chain_dir_info.pl exp/chain/cnn1a_chainali/ +# exp/chain/cnn_chainali_1a/: num-iters=21 nj=2..4 num-params=3.8M dim=40->380 combine=-0.009->-0.006 xent:train/valid[13,20,final]=(-0.870,-0.593,-0.568/-1.08,-0.889,-0.874) logprob:train/valid[13,20,final]=(-0.035,-0.003,-0.001/-0.077,-0.055,-0.054) + +# head exp/chain/cnn_chainali_1a/decode_test/scoring_kaldi/best_wer +# %WER 13.68 [ 2410 / 17616, 243 ins, 633 del, 1534 sub ] exp/chain/cnn_chainali_1a/decode_test/wer_8_1.0 + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +gmm=tri3 # this is the source gmm-dir that we'll use for alignments; it + # should have alignments for the specified training data. +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +ali=tri3_ali +chain_model_dir=exp/chain${nnet3_affix}/cnn${affix} +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +alignment_subsampling_factor=1 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=32 +chunk_right_context=32 +tdnn_dim=450 +# training options +srand=0 +remove_egs=false +lang_test=lang_test +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + local/chain/align_nnet3_lats.sh --nj $nj --cmd "$train_cmd" ${lores_train_data_dir} \ + data/$lang_test $chain_model_dir $lat_dir + cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" $num_leaves ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + common1="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="required-time-offsets=0 height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="required-time-offsets=0 height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn4 dim=$tdnn_dim target-rms=0.5 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=$alignment_subsampling_factor \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 32 \ + --extra-right-context-final 32 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$decode_cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/s5/local/iam_train_lm.sh b/egs/iam/s5/local/iam_train_lm.sh new file mode 100755 index 00000000000..e7ba2295b13 --- /dev/null +++ b/egs/iam/s5/local/iam_train_lm.sh @@ -0,0 +1,140 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# Apache 2.0 +# +# This script trains a LM on the Cantab-Tedlium text data and tedlium acoustic training data. +# It is based on the example scripts distributed with PocoLM + +# It will first check if pocolm is installed and if not will process with installation +# It will then get the source data from the pre-downloaded Cantab-Tedlium files +# and the pre-prepared data/train text source. + + +set -e +stage=0 + +echo "$0 $@" # Print the command line for logging +. utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data + +. ./path.sh +mkdir -p $dir +. 
./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +num_dev_sentences=6161 +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. +#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + head -86858 data/download/lobcorpus/0167/download/LOB_COCOA/lob.txt > ${dir}/data/text/text.txt + cat data/download/browncorpus/brown.txt >> ${dir}/data/text/text.txt + #cat test_words.txt >> ${dir}/data/text/text.txt + # use a subset of the annotated training data as the dev set . + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + # .. and the rest of the training data as an additional data source. + # we can later fold the dev data into this. + head -n $[$num_dev_sentences] < data/train/text | cut -d " " -f 2- > ${dir}/data/text/ted.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (a subset of the training data is used as ${dir}/data/text/ted.txt to work + # out interpolation weights. + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get wordlist + cat ${dir}/data/text/text.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=3 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
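
As an aside (illustration only, not part of the patch): the word_count/wordlist files built by the tr | sort | uniq -c pipeline in stage 0 above are just a frequency-sorted token list. A rough Python equivalent, under the assumption that ties may be ordered differently than 'sort -bnr':

# Illustrative sketch of the stage-0 wordlist computation (not used by the recipe).
from collections import Counter

def make_wordlist(text_path, wordlist_path):
    counts = Counter()
    with open(text_path) as f:
        for line in f:
            counts.update(line.split())   # split on whitespace, like tr '[:space:]'
    with open(wordlist_path, 'w') as out:
        for word, _ in counts.most_common():   # most frequent first
            out.write(word + '\n')

# e.g. make_wordlist('data/local/local_lm/data/text/text.txt',
#                    'data/local/local_lm/data/wordlist')
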
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='train=2 ted=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + --fold-dev-into=ted ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' + #log-prob: -5.05603614242 [perplexity = 156.967086371] over 19477.0 words +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_big was -5.06654404785 per word [perplexity = 158.625177948] over 19477.0 words + # current results, after adding --limit-unk-history=true: + + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. 
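
As an illustration only (not part of the recipe): the perplexities that get_data_prob.py reports in the comments of this script are simply the exponential of the negative per-word log-prob, e.g. for the numbers quoted above and below:

# Illustrative check of the perplexity figures quoted in this script's comments.
import math

def perplexity(logprob_per_word):
    return math.exp(-logprob_per_word)

print(perplexity(-5.06654404785))   # ~158.6, the 3-gram "big" pruned LM
print(perplexity(-5.24719139498))   # ~190.0, the 500k-ngram "small" pruned LM
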
+ size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + # get_data_prob.py: log-prob of data/local/local_lm/data/real_dev_set.txt given model data/local/local_lm/data/lm_3_prune_small was -5.24719139498 per word [perplexity = 190.031793995] over 19477.0 words + # current results, after adding --limit-unk-history=true (needed for modeling OOVs and not blowing up LG.fst): + + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/s5/local/make_feature_vect.py b/egs/iam/s5/local/make_feature_vect.py new file mode 100755 index 00000000000..dd35f1b14c7 --- /dev/null +++ b/egs/iam/s5/local/make_feature_vect.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python + +import argparse +import os +import sys +import scipy.io as sio +import numpy as np +from scipy import misc + +from signal import signal, SIGPIPE, SIG_DFL +signal(SIGPIPE,SIG_DFL) + +parser = argparse.ArgumentParser(description="""Generates and saves the feature vectors""") +parser.add_argument('dir', type=str, help='directory of images.scp and is also output directory') +parser.add_argument('--out-ark', type=str, default='-', help='where to write the output feature file') +parser.add_argument('--scale-size', type=int, default=40, help='size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, help='size to scale the height of all images') +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im): + scale_size = args.scale_size + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + padding_x = max(5,int((args.padding/100)*im.shape[1])) + padding_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((padding_y,padding_x), dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad,255 * np.ones((padding_y, padding_x), dtype=int)), axis=1) + return im_pad1 + +### main ### +data_list_path = os.path.join(args.dir,'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scale = get_scaled_image(im) + + data = np.transpose(im_scale, (1, 0)) + data = np.divide(data, 255.0) + write_kaldi_matrix(out_fh, data, image_id) + diff --git a/egs/iam/s5/local/prepare_data.sh b/egs/iam/s5/local/prepare_data.sh new file mode 100755 index 00000000000..2c79e67cc89 --- /dev/null +++ b/egs/iam/s5/local/prepare_data.sh @@ -0,0 +1,135 @@ +#!/bin/bash +# This script loads the IAM handwritten dataset + +stage=0 +nj=20 +dir=data + +. ./cmd.sh +if [ -f path.sh ]; then . ./path.sh; fi +. 
parse_options.sh || exit 1; + +#download dir +add_val_data_train=false +dl_dir=data/download +lines=$dl_dir/lines +xml=$dl_dir/xml +ascii=$dl_dir/ascii +bcorpus=$dl_dir/browncorpus +lobcorpus=$dl_dir/lobcorpus +data_split_info=$dl_dir/largeWriterIndependentTextLineRecognitionTask +lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz +xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz +data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip +ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz +brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt +lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +mkdir -p $dl_dir +#download and extact images and transcription +if [ -d $lines ]; then + echo Not downloading lines images as it is already there. +else + if [ ! -f $dl_dir/lines.tgz ]; then + echo Downloading lines images... + wget -P $dl_dir --user userjh --password password $lines_url || exit 1; + fi + mkdir -p $lines + tar -xvzf $dl_dir/lines.tgz -C $lines || exit 1; + echo Done downloading and extracting lines images +fi + +if [ -d $xml ]; then + echo Not downloading transcription as it is already there. +else + if [ ! -f $dl_dir/xml.tgz ]; then + echo Downloading transcription ... + wget -P $dl_dir --user userjh --password password $xml_url || exit 1; + fi + mkdir -p $xml + tar -xvzf $dl_dir/xml.tgz -C $xml || exit 1; + echo Done downloading and extracting transcription +fi + +if [ -d $data_split_info ]; then + echo Not downloading data split, training and testing split, information as it is already there. +else + if [ ! -f $dl_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo Downloading training and testing data Split Information ... + wget -P $dl_dir --user userjh --password password $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $dl_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo Done downloading and extracting training and testing data Split Information +fi + +if [ -d $ascii ]; then + echo Not downloading ascii folder as it is already there. +else + if [ ! -f $dl_dir/ascii.tgz ]; then + echo Downloading ascii folder ... + wget -P $dl_dir --user userjh --password password $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xvzf $dl_dir/ascii.tgz -C $ascii || exit 1; + echo Done downloading and extracting ascii folder +fi + +if [ -d $lobcorpus ]; then + echo Not downloading lob corpus as it is already there. +else + if [ ! -f $dl_dir/0167.zip ]; then + echo Downloading lob corpus ... + wget -P $dl_dir $lob_corpus_url || exit 1; + fi + mkdir -p $lobcorpus + unzip $dl_dir/0167.zip -d $lobcorpus || exit 1; + echo Done downloading and extracting lob corpus +fi + +if [ -d $bcorpus ]; then + echo Not downloading brown corpus as it is already there. +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo Downloading brown corpus ... 
+ wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo Done downloading brown corpus +fi + +mkdir -p $dir/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask +testset=testset.txt +trainset=trainset.txt +val1=validationset1.txt +val2=validationset2.txt +test_path="$dl_dir/$file_name/$testset" +train_path="$dl_dir/$file_name/$trainset" +val1_path="$dl_dir/$file_name/$val1" +val2_path="$dl_dir/$file_name/$val2" + +new_train_set=new_trainset.txt +new_test_set=new_testset.txt +new_val_set=new_valset.txt +new_train_path="$dir/$new_train_set" +new_test_path="$dir/$new_test_set" +new_val_path="$dir/$new_val_set" + +if [ $add_val_data_train = true ]; then + cat $train_path $val1_path $val2_path > $new_train_path + cat $test_path > $new_test_path + cat $val1_path $val2_path > $new_val_path +else + cat $train_path > $new_train_path + cat $test_path > $new_test_path + cat $val1_path $val2_path > $new_val_path +fi + +if [ $stage -le 0 ]; then + local/process_data.py $dl_dir $dir/train $dir --dataset new_trainset --model_type word || exit 1 + local/process_data.py $dl_dir $dir/test $dir --dataset new_testset --model_type word || exit 1 + local/process_data.py $dl_dir $dir/val $dir --dataset new_valset --model_type word || exit 1 + + utils/utt2spk_to_spk2utt.pl $dir/train/utt2spk > $dir/train/spk2utt + utils/utt2spk_to_spk2utt.pl $dir/test/utt2spk > $dir/test/spk2utt +fi diff --git a/egs/iam/s5/local/prepare_dict.sh b/egs/iam/s5/local/prepare_dict.sh new file mode 100755 index 00000000000..5d13bdc5d8b --- /dev/null +++ b/egs/iam/s5/local/prepare_dict.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +train_text=$1 +test_text=$2 +dir=$3 + +mkdir -p $dir + +local/prepare_lexicon.py $train_text $test_text $dir + +cut -d' ' -f2- $dir/lexicon.txt | tr ' ' '\n' | sort -u >$dir/nonsilence_phones.txt || exit 1; + +( echo ' SIL'; ) >> $dir/lexicon.txt || exit 1; +( echo ' SIL'; ) >> $dir/lexicon.txt || exit 1; + +( echo SIL ) > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/iam/s5/local/prepare_lexicon.py b/egs/iam/s5/local/prepare_lexicon.py new file mode 100755 index 00000000000..86298c45733 --- /dev/null +++ b/egs/iam/s5/local/prepare_lexicon.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +import argparse +import os +import sys + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('database_path', type=str, help='path to train text file') +parser.add_argument('test_text', type=str, help='path to test text file to include it in lexicon') +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +char = {} +lex = {} + +text_path = os.path.join(args.database_path,'text') +with open(text_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + for i in range(1,len(line_vect)): + characters = list(line_vect[i]) + entry = " ".join(characters) + entry = entry.replace("#", "") + if line_vect[i]: + lex[line_vect[i]] = entry + +if args.test_text > 1: + text_path = os.path.join(args.test_text,'text') + with open(text_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + for i in range(1,len(line_vect)): + characters = list(line_vect[i]) + entry = " ".join(characters) + entry = entry.replace("#", "") + if line_vect[i]: + lex[line_vect[i]] = entry + + +lex_file = os.path.join(args.dir, 'lexicon.txt') +lex_fh = open(lex_file, 'w+') +for key in sorted(lex): + 
lex_fh.write(key + " " + lex[key] + "\n") diff --git a/egs/iam/s5/local/prepare_lm.sh b/egs/iam/s5/local/prepare_lm.sh new file mode 100755 index 00000000000..59178b881fc --- /dev/null +++ b/egs/iam/s5/local/prepare_lm.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Copyright 2015 Hossein Hadian +# Apache 2.0 + + +ngram=2 + +[ -f ./path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1 + +# This is only called for decoding purposes. This is not necessary for training. +if [ $# != 2 ]; then + echo "Usage: $0 [--ngram ] " + echo " can either be an ARPA file or transcriptions file, e.g. train/text" + exit 1; +fi + +lmsrc=$1 +lang=$2 + +tmpdir=$lang/temp +mkdir -p $tmpdir + +if [[ $lmsrc == *.arpa ]]; then + echo "$0: Using arpa LM: "$lmsrc + arpa=$lmsrc +else + echo "$0: No Arpa file provided..Creating a $ngram-gram LM from provided text: "$lmsrc; + if [ -z $IRSTLM ]; then + export IRSTLM=$PWD/../../../tools/irstlm + export PATH=$PWD/../../../tools/irstlm/bin:$PATH + echo "$0: Assuming irstlm is at: "$IRSTLM + fi + ! which build-lm.sh 2>/dev/null && \ + echo "$0: IRSTLM does not seem to be installed (build-lm.sh not on your path): " && \ + echo "go to /tools and try 'make irstlm_tgt'" && exit 1; + + cut -d' ' -f2- $lmsrc | sed -e 's:^: :' -e 's:$: :' \ + > $tmpdir/tmp_lm_train + build-lm.sh -k 1 -i $tmpdir/tmp_lm_train -n $ngram -o $tmpdir/tmp_lm.ilm.gz + + compile-lm $tmpdir/tmp_lm.ilm.gz -t=yes /dev/stdout | \ + grep -v unk > $tmpdir/lm_phone_bg.arpa + + arpa=$tmpdir/lm_phone_bg.arpa +fi + +cat $arpa | utils/find_arpa_oovs.pl $lang/words.txt > $tmpdir/oovs.txt + +cat $arpa | \ + grep -v ' ' | \ + grep -v ' ' | \ + grep -v ' ' | \ + arpa2fst - | fstprint | \ + utils/remove_oovs.pl $tmpdir/oovs.txt | \ + utils/eps2disambig.pl | utils/s2eps.pl | \ + fstcompile --isymbols=$lang/words.txt --osymbols=$lang/words.txt \ + --keep_isymbols=false --keep_osymbols=false | \ + fstrmepsilon | fstarcsort --sort_type=ilabel >$lang/G.fst || exit 1; + +echo "$0: Done preparing the LM" diff --git a/egs/iam/s5/local/process_data.py b/egs/iam/s5/local/process_data.py new file mode 100755 index 00000000000..f9838d34563 --- /dev/null +++ b/egs/iam/s5/local/process_data.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +import argparse +import os +import sys +import numpy as np +from scipy import misc +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text utt2spk + and image file """) +parser.add_argument('database_path', type=str, + help='path to downloaded iam data') +parser.add_argument('out_dir', type=str, + help='where to write output files') +parser.add_argument('dataset_dir', type=str, + help='directory containing dataset') +parser.add_argument('--dataset', type=str, default='new_trainset', + choices=['new_trainset', 'new_testset','new_valset'], + help='choose new_trainset, testset') +parser.add_argument('--model_type', type=str,default='word', + choices=['word', 'character'], + help='word model or character model') +args = parser.parse_args() + +### main ### +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.dataset_dir, + args.dataset + '.txt') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open 
(text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + line_vect = line.split(' ') + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[line_vect[0]] = text + +def process_text_file_for_char_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + line_vect = line.split(' ') + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + characters = list(text) + spaced_characters = " ".join(characters) + spaced_characters = spaced_characters.replace("|", "SIL") + spaced_characters = "SIL " + spaced_characters + spaced_characters = spaced_characters + " SIL" + text_dict[line_vect[0]] = spaced_characters + + +if args.model_type=='word': + print 'processing word model' + process_text_file_for_word_model() +else: + print 'processing char model' + process_text_file_for_char_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + + form_elements = doc.getElementsByTagName('form')[0] + writer_id = form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/s5/local/run_unk_model.sh b/egs/iam/s5/local/run_unk_model.sh new file mode 100755 index 00000000000..b8ed377370a --- /dev/null +++ b/egs/iam/s5/local/run_unk_model.sh @@ -0,0 +1,42 @@ +#!/bin/bash + + +utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 data/train/dict exp/unk_lang_model + +utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \ + --unk-fst exp/unk_lang_model/unk_fst.txt data/train/dict "" data/lang/temp data/lang_unk + +# note: it's important that the LM we built in data/lang/G.fst was created using +# pocolm with the option --limit-unk-history=true (see ted_train_lm.sh). This +# keeps the graph compact after adding the unk model (we only have to add one +# copy of it). + +cp data/lang_test_corpus/G.fst data/lang_unk/G.fst +#utils/mkgraph.sh data/lang_unk exp/tri3 exp/tri3/graph_unk + +. ./cmd.sh + +## Caution: if you use this unk-model stuff, be sure that the scoring script +## does not use lattice-align-words-lexicon, because it's not compatible with +## the unk-model. Instead you should use lattice-align-words (of course, this +## only works if you have position-dependent phones). 
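
As an illustration only (not part of the patch): local/process_data.py above reads the IAM ascii/lines.txt file, where the transcription starts at the ninth whitespace-separated field and words are separated by '|'. A minimal stand-alone sketch of that parsing step; the sample line is only meant to show the format:

# Illustrative sketch of the lines.txt parsing done by process_data.py above.
def parse_lines_txt_entry(line):
    if line.startswith('#'):
        return None                       # header/comment lines are skipped
    fields = line.strip().split(' ')
    line_id = fields[0]
    transcription = "".join(fields[8:]).replace("|", " ")
    return line_id, transcription

sample = "a01-000u-00 ok 154 19 408 746 1663 91 A|MOVE|to|stop|Mr.|Gaitskell"
print(parse_lines_txt_entry(sample))
# -> ('a01-000u-00', 'A MOVE to stop Mr. Gaitskell')
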
+ +#decode_=30 +#for dset in dev test; do +# steps/decode_fmllr.sh --nj $decode_nj --cmd "$decode_cmd" --num-threads 4 \ +# exp/tri3/graph_unk data/${dset} exp/tri3/decode_${dset}_unk +# steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" data/lang data/lang_rescore \ +# data/${dset} exp/tri3/decode_${dset}_unk exp/tri3/decode_${dset}_unk_rescore +#done +# +#frames_per_chunk=$(echo $chunk_width | cut -d, -f1) +# steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ +# --extra-left-context $chunk_left_context \ +# --extra-right-context $chunk_right_context \ +# --extra-left-context-initial 0 \ +# --extra-right-context-final 0 \ +# --frames-per-chunk $frames_per_chunk \ +# --nj $nj --cmd "$decode_cmd" \ +# $dir/graph data/test $dir/lang_test_corpus_c || exit 1 +# +# # for x in exp/tri3/decode*; do grep Sum $x/*/*ys | utils/best_wer.sh ; done | grep -v old | grep -v si diff --git a/egs/iam/s5/local/score.sh b/egs/iam/s5/local/score.sh new file mode 100755 index 00000000000..518bd6d0459 --- /dev/null +++ b/egs/iam/s5/local/score.sh @@ -0,0 +1,154 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 + +# See the script steps/scoring/score_kaldi_cer.sh in case you need to evalutate CER + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=7 +max_lmwt=17 +iter=final +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." + echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 +model_path=`echo $dir |xargs dirname` +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-1best ark:- ark:- \| \ + lattice-align-words $lang_or_graph/phones/word_boundary.int $model_path/final.mdl ark:- ark:- \| \ + lattice-arc-post $model_path/final.mdl ark:- - \| \ + local/unk_arc_post_to_transcription.py $lang_or_graph/phones.txt $lang_or_graph/words.txt data/lang_unk/oov.int \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + tr '[:upper:]' '[:lower:]' \| \ + compute-wer --text --mode=present \ + "ark:cat $dir/scoring_kaldi/test_filt.txt| tr '[:upper:]' '[:lower:]' |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." 
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+      utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+        ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+        '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 $data $lang_or_graph $dir
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all old-style wer_{?,??}
+# and cer_{?,??} files; these originate from the previous version of the
+# scoring scripts, and leaving them around could cause confusion about
+# which results are the current ones.
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
+
+exit 0;
diff --git a/egs/iam/s5/local/unk_arc_post_to_transcription.py b/egs/iam/s5/local/unk_arc_post_to_transcription.py
new file mode 100755
index 00000000000..c27bf226cf9
--- /dev/null
+++ b/egs/iam/s5/local/unk_arc_post_to_transcription.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+import argparse
+import os
+import sys
+import numpy as np
+from scipy import misc
+parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
+parser.add_argument('phones', type=str, help='phones and phonesID')
+parser.add_argument('words', type=str, help='word and wordID')
+parser.add_argument('unk', type=str, default='-', help='location of unk file')
+parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data')
+parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data')
+args = parser.parse_args()
+### main ###
+phone_fh = open(args.phones, 'r')
+word_fh = open(args.words, 'r')
+unk_fh = open(args.unk, 'r')
+if args.input_ark == '-':
+  input_fh = sys.stdin
+else:
+  input_fh = open(args.input_ark, 'r')
+if args.out_ark == '-':
+  out_fh = sys.stdout
+else:
+  out_fh = open(args.out_ark, 'wb')
+
+phone_dict = dict()  # stores phoneID to phone mapping
+phone_data_vect = phone_fh.read().strip().split("\n")
+for key_val in phone_data_vect:
+  key_val = key_val.split(" ")
+  phone_dict[key_val[1]] = key_val[0]
+word_dict = dict()
+word_data_vect = word_fh.read().strip().split("\n")
+for key_val in word_data_vect:
+  key_val = key_val.split(" ")
+  word_dict[key_val[1]] = key_val[0]
+unk_val = unk_fh.read().strip().split(" ")[0]
+
+utt_word_dict = dict()
+utt_phone_dict = dict()  # stores utteranceID to phoneID mapping
+unk_word_dict = dict()
+count = 0
+for line in input_fh:
+  line_vect = line.strip().split("\t")
+  if len(line_vect) < 6:
+    print "Skipping malformed line (expected at least 6 tab-separated fields):"
+    print line_vect
+    continue
+  uttID =
line_vect[0] + word = line_vect[4] + phones = line_vect[5] + if uttID in utt_word_dict.keys(): + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + else: + count = 0 + utt_word_dict[uttID] = dict() + utt_phone_dict[uttID] = dict() + utt_word_dict[uttID][count] = word + utt_phone_dict[uttID][count] = phones + if word == unk_val: # get character sequence for unk + phone_key_vect = phones.split(" ") + phone_val_vect = list() + for pkey in phone_key_vect: + phone_val_vect.append(phone_dict[pkey]) + phone_2_word = list() + for phone_val in phone_val_vect: + phone_2_word.append(phone_val.split('_')[0]) + phone_2_word = ''.join(phone_2_word) + utt_word_dict[uttID][count] = phone_2_word + else: + if word == '0': + word_val = ' ' + else: + word_val = word_dict[word] + utt_word_dict[uttID][count] = word_val + count += 1 + +transcription = "" +for key in sorted(utt_word_dict.iterkeys()): + transcription = key + for index in sorted(utt_word_dict[key].iterkeys()): + value = utt_word_dict[key][index] + transcription = transcription + " " + value + out_fh.write(transcription + '\n') diff --git a/egs/iam/s5/path.sh b/egs/iam/s5/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/iam/s5/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/iam/s5/run.sh b/egs/iam/s5/run.sh new file mode 100755 index 00000000000..d1eeda7e0d1 --- /dev/null +++ b/egs/iam/s5/run.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +stage=0 +nj=20 +color=1 +data_dir=data +exp_dir=exp +augment=false +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. utils/parse_options.sh # e.g. this parses the --stage option if supplied. 
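+# For example (the values below are only an illustration), a complete run could
+# be launched as "./run.sh", while a run that has already finished data
+# preparation and feature extraction might be resumed with
+# "./run.sh --stage 2 --nj 30"; any of the variables set above (stage, nj,
+# color, data_dir, exp_dir, augment) can be overridden from the command line in
+# the same way, since utils/parse_options.sh turns them into options.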
+
+if [ $stage -le 0 ]; then
+  local/prepare_data.sh --nj $nj --dir $data_dir
+fi
+mkdir -p $data_dir/{train,test}/data
+
+if [ $stage -le 1 ]; then
+  local/make_feature_vect.py $data_dir/test --scale-size 40 | \
+    copy-feats --compress=true --compression-method=7 \
+      ark:- ark,scp:$data_dir/test/data/images.ark,$data_dir/test/feats.scp || exit 1
+  steps/compute_cmvn_stats.sh $data_dir/test || exit 1;
+
+  if [ $augment = true ]; then
+    # create a backup directory to store the text, utt2spk and images.scp files
+    mkdir -p $data_dir/train/backup
+    mv $data_dir/train/text $data_dir/train/utt2spk $data_dir/train/images.scp $data_dir/train/backup/
+    local/augment_and_make_feature_vect.py $data_dir/train --scale-size 40 --vertical-shift 10 | \
+      copy-feats --compress=true --compression-method=7 \
+        ark:- ark,scp:$data_dir/train/data/images.ark,$data_dir/train/feats.scp || exit 1
+    utils/utt2spk_to_spk2utt.pl $data_dir/train/utt2spk > $data_dir/train/spk2utt
+  else
+    local/make_feature_vect.py $data_dir/train --scale-size 40 | \
+      copy-feats --compress=true --compression-method=7 \
+        ark:- ark,scp:$data_dir/train/data/images.ark,$data_dir/train/feats.scp || exit 1
+  fi
+  steps/compute_cmvn_stats.sh $data_dir/train || exit 1;
+fi
+
+numSilStates=4
+numStates=8
+
+if [ $stage -le 2 ]; then
+  local/prepare_dict.sh $data_dir/train/ $data_dir/test/ $data_dir/train/dict
+  utils/prepare_lang.sh --num-sil-states $numSilStates --num-nonsil-states $numStates \
+    $data_dir/train/dict "<unk>" $data_dir/lang/temp $data_dir/lang
+fi
+
+if [ $stage -le 3 ]; then
+  local/iam_train_lm.sh
+  cp -R $data_dir/lang -T $data_dir/lang_test_corpus
+  gunzip -k -f data/local/local_lm/data/arpa/3gram_big.arpa.gz
+  local/prepare_lm.sh data/local/local_lm/data/arpa/3gram_big.arpa $data_dir/lang_test_corpus || exit 1;
+  local/run_unk_model.sh
+fi
+
+num_gauss=10000
+numLeavesTri=500
+numGaussTri=20000
+
+if [ $stage -le 4 ]; then
+  steps/train_mono.sh --nj $nj --cmd $cmd \
+    --totgauss $num_gauss \
+    $data_dir/train \
+    $data_dir/lang \
+    $exp_dir/mono
+fi
+
+if [ $stage -le 5 ]; then
+  utils/mkgraph.sh --mono $data_dir/lang_test_corpus \
+    $exp_dir/mono \
+    $exp_dir/mono/graph
+  steps/decode.sh --nj $nj --cmd $cmd \
+    $exp_dir/mono/graph \
+    $data_dir/test \
+    $exp_dir/mono/decode_test
+fi
+
+if [ $stage -le 6 ]; then
+  steps/align_si.sh --nj $nj --cmd $cmd \
+    $data_dir/train $data_dir/lang \
+    $exp_dir/mono \
+    $exp_dir/mono_ali
+  steps/train_deltas.sh --cmd $cmd \
+    $numLeavesTri $numGaussTri $data_dir/train $data_dir/lang \
+    $exp_dir/mono_ali \
+    $exp_dir/tri
+fi
+
+if [ $stage -le 7 ]; then
+  utils/mkgraph.sh $data_dir/lang_test_corpus \
+    $exp_dir/tri \
+    $exp_dir/tri/graph
+  steps/decode.sh --nj $nj --cmd $cmd \
+    $exp_dir/tri/graph \
+    $data_dir/test \
+    $exp_dir/tri/decode_test
+fi
+
+if [ $stage -le 8 ]; then
+  steps/align_si.sh --nj $nj --cmd $cmd \
+    $data_dir/train $data_dir/lang \
+    $exp_dir/tri \
+    $exp_dir/tri_ali
+  steps/train_lda_mllt.sh --cmd $cmd \
+    --splice-opts "--left-context=3 --right-context=3" \
+    $numLeavesTri $numGaussTri \
+    $data_dir/train $data_dir/lang \
+    $exp_dir/tri_ali $exp_dir/tri2
+fi
+
+if [ $stage -le 9 ]; then
+  utils/mkgraph.sh $data_dir/lang_test_corpus \
+    $exp_dir/tri2 \
+    $exp_dir/tri2/graph
+  steps/decode.sh --nj $nj --cmd $cmd \
+    $exp_dir/tri2/graph \
+    $data_dir/test \
+    $exp_dir/tri2/decode_test
+fi
+
+if [ $stage -le 10 ]; then
+  steps/align_fmllr.sh --nj $nj --cmd $cmd \
+    --use-graphs true \
+    $data_dir/train $data_dir/lang \
+    $exp_dir/tri2 \
+    $exp_dir/tri2_ali
+ 
steps/train_sat.sh --cmd $cmd \ + $numLeavesTri $numGaussTri \ + $data_dir/train $data_dir/lang \ + $exp_dir/tri2_ali $exp_dir/tri3 +fi + +if [ $stage -le 11 ]; then + utils/mkgraph.sh $data_dir/lang_test_corpus \ + $exp_dir/tri3 \ + $exp_dir/tri3/graph + steps/decode_fmllr.sh --nj $nj --cmd $cmd \ + $exp_dir/tri3/graph \ + $data_dir/test \ + $exp_dir/tri3/decode_test +fi + +if [ $stage -le 12 ]; then + steps/align_fmllr.sh --nj $nj --cmd $cmd \ + --use-graphs true \ + $data_dir/train $data_dir/lang \ + $exp_dir/tri3 \ + $exp_dir/tri3_ali +fi + +affix=_1a +nnet3_affix= +if [ $stage -le 13 ]; then + local/chain/run_cnn_1a.sh --stage 0 \ + --gmm tri3 \ + --ali tri3_ali \ + --affix $affix \ + --nnet3_affix $nnet3_affix \ + --lang_test lang_unk +fi + +if [ $stage -le 14 ]; then + local/chain/run_cnn_chainali_1a.sh --stage 0 \ + --gmm tri3 \ + --ali tri3_ali \ + --affix $affix \ + --nnet3_affix $nnet3_affix \ + --chain_model_dir $exp_dir/chain${nnet3_affix}/cnn${affix} \ + --lang_test lang_unk +fi diff --git a/egs/iam/s5/steps b/egs/iam/s5/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iam/s5/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iam/s5/utils b/egs/iam/s5/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iam/s5/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file
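A note on checking results: local/score.sh above writes the best WER for each decode
directory to scoring_kaldi/best_wer, and steps/scoring/score_kaldi_cer.sh is expected
to write an analogous CER summary next to it (assumed here to be named
scoring_kaldi/best_cer). A minimal sketch of how the numbers might be collected after
run.sh finishes, assuming the default exp/ layout used above:

for d in $(find exp -type d -name 'decode*'); do
  # best WER over LM weights and insertion penalties, written by local/score.sh
  [ -f $d/scoring_kaldi/best_wer ] && cat $d/scoring_kaldi/best_wer
  # CER summary, if the CER scoring produced one (file name assumed)
  [ -f $d/scoring_kaldi/best_cer ] && cat $d/scoring_kaldi/best_cer
done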