From 52c6721da08493c0f6d5b07e3bb690a18a96365b Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Sun, 18 Mar 2018 22:21:35 -0400
Subject: [PATCH 1/9] Add initial scripts for e2e ocr - not cleaned

---
 egs/cifar/v1/image/get_allowed_lengths.py  | 312 +++++++++++++++++++++
 egs/cifar/v1/image/get_image2num_frames.py |  64 +++++
 egs/iam/v1/run_end2end.sh                  |  76 +++++
 3 files changed, 452 insertions(+)
 create mode 100755 egs/cifar/v1/image/get_allowed_lengths.py
 create mode 100755 egs/cifar/v1/image/get_image2num_frames.py
 create mode 100755 egs/iam/v1/run_end2end.sh

diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py
new file mode 100755
index 00000000000..07db16e2238
--- /dev/null
+++ b/egs/cifar/v1/image/get_allowed_lengths.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python
+
+# Copyright     2017  Hossein Hadian
+# Apache 2.0
+
+
+""" This script finds a set of allowed image lengths (in frames), spaced by a
+    factor (like 10%), and writes them to allowed_lengths.txt in the data dir
+"""
+
+import argparse
+import os
+import sys
+import copy
+import math
+import logging
+
+sys.path.insert(0, 'steps')
+import libs.common as common_lib
+
+logger = logging.getLogger('libs')
+logger.setLevel(logging.INFO)
+handler = logging.StreamHandler()
+handler.setLevel(logging.INFO)
+formatter = logging.Formatter("%(asctime)s [%(pathname)s:%(lineno)s - "
+                              "%(funcName)s - %(levelname)s ] %(message)s")
+handler.setFormatter(formatter)
+logger.addHandler(handler)
+
+def get_args():
+    """Parse the command line: positional 'factor' and 'srcdir' plus options."""
+    parser = argparse.ArgumentParser(description="""This script reads
+                                   'srcdir'/image2num_frames.txt and writes
+                                   'srcdir'/allowed_lengths.txt: image lengths
+                                   (in frames) that are multiples of the frame
+                                   subsampling factor, spaced by about factor%
+                                   and covering most lengths in 'srcdir'.""")
+    parser.add_argument('factor', type=float, default=12,
+                        help='Spacing (in percentage) between allowed lengths.')
+    parser.add_argument('srcdir', type=str,
+                        help='path to source data dir')
+    parser.add_argument('--coverage-factor', type=float, default=0.05,
+                        help="""Percentage of durations not covered from each
+                             side of duration histogram.""")
+    parser.add_argument('--frame-subsampling-factor', type=int, default=3,
+                        help="""Chain frame subsampling factor.
+                             See steps/nnet3/chain/train.py""")
+
+    args = parser.parse_args()
+    return args
+
+class Utterance:
+    """ This class represents a Kaldi utterance
+        in a data directory like data/train
+    """
+
+    def __init__(self, uid, wavefile, speaker, transcription, dur):
+        self.wavefile = (wavefile if wavefile.rstrip().endswith('|') else
+                         'cat {} |'.format(wavefile))
+        self.speaker = speaker
+        self.transcription = transcription
+        self.id = uid
+        self.dur = float(dur)
+
+    def to_kaldi_utt_str(self):
+        return self.id + " " + self.transcription
+
+    def to_kaldi_wave_str(self):
+        return self.id + " " + self.wavefile
+
+    def to_kaldi_dur_str(self):
+        return "{} {:0.3f}".format(self.id, self.dur)
+
+
+def read_kaldi_datadir(dir):
+    """ Read a data directory like
+        data/train as a list of utterances
+    """
+
+    # check to make sure that no segments file exists as this script won't work
+    # with data directories which use a segments file.
+    if os.path.isfile(os.path.join(dir, 'segments')):
+        logger.info("The data directory '{}' seems to use a 'segments' file. "
+                    "This script does not yet support a 'segments' file. You'll need "
+                    "to use utils/data/extract_wav_segments_data_dir.sh "
+                    "to convert the data dir so it does not use a 'segments' file. "
+                    "Exiting...".format(dir))
+        sys.exit(1)
+
+    logger.info("Loading the data from {}...".format(dir))
+    utterances = []
+    wav_scp = read_kaldi_mapfile(os.path.join(dir, 'wav.scp'))
+    text = read_kaldi_mapfile(os.path.join(dir, 'text'))
+    utt2dur = read_kaldi_mapfile(os.path.join(dir, 'utt2dur'))
+    utt2spk = read_kaldi_mapfile(os.path.join(dir, 'utt2spk'))
+
+    num_fail = 0
+    for utt in wav_scp:
+        if utt in text and utt in utt2dur and utt in utt2spk:
+            utterances.append(Utterance(utt, wav_scp[utt], utt2spk[utt],
+                                  text[utt], utt2dur[utt]))
+        else:
+            num_fail += 1
+
+    if len(utterances) / len(wav_scp) < 0.5:
+        logger.info("More than half your data is problematic. Try "
+                    "fixing using fix_data_dir.sh.")
+        sys.exit(1)
+
+    logger.info("Successfully read {} utterances. Failed for {} "
+                "utterances.".format(len(utterances), num_fail))
+    return utterances
+
+
+def read_kaldi_mapfile(path):
+    """ Read any Kaldi mapping file - like text, .scp files, etc.
+       Returns a dict mapping the first field of each line to the rest of it.
+    """
+    m = {}
+    with open(path, 'r') as f:
+        for line in f:
+            # Split at the first space only. The old find()/slice code got -1
+            # for a line without a space and so chopped the key's last char
+            # and used the whole line as value ("utt1" -> {"utt": "utt1"}).
+            key, _, val = line.strip().partition(' ')
+            m[key] = val
+    return m
+
+def generate_kaldi_data_files(utterances, outdir):
+    """ Write out a list of utterances as Kaldi data files into an
+        output data directory.
+    """
+
+    logger.info("Exporting to {}...".format(outdir))
+    speakers = {}
+
+    with open(os.path.join(outdir, 'text'), 'w') as f:
+        for utt in utterances:
+            f.write(utt.to_kaldi_utt_str() + "\n")
+
+    with open(os.path.join(outdir, 'wav.scp'), 'w') as f:
+        for utt in utterances:
+            f.write(utt.to_kaldi_wave_str() + "\n")
+
+    with open(os.path.join(outdir, 'utt2dur'), 'w') as f:
+        for utt in utterances:
+            f.write(utt.to_kaldi_dur_str() + "\n")
+
+    with open(os.path.join(outdir, 'utt2spk'), 'w') as f:
+        for utt in utterances:
+            f.write(utt.id + " " + utt.speaker + "\n")
+            if utt.speaker not in speakers:
+                speakers[utt.speaker] = [utt.id]
+            else:
+                speakers[utt.speaker].append(utt.id)
+
+    with open(os.path.join(outdir, 'spk2utt'), 'w') as f:
+        for s in speakers:
+            f.write(s + " ")
+            for utt in speakers[s]:
+                f.write(utt + " ")
+            f.write('\n')
+
+    logger.info("Successfully wrote {} utterances to data "
+                "directory '{}'".format(len(utterances), outdir))
+
+def find_duration_range(img2len, coverage_factor):
+    """Given a list of utterances, find the start and end duration to cover
+
+     'img2len' maps image-id -> length (any value int() accepts). Covering
+     every length in the training set would give far too many allowed
+     lengths, so lengths are sorted and 'coverage_factor' percent of the
+     total length is left uncovered at each end.
+
+     Returns
+     -------
+     start_dur: int  (never less than the hard floor of 30)
+     end_dur: int
+     Raises ValueError if the lengths are empty or sum to zero.
+    """
+    durs = sorted(int(imlen) for imlen in img2len.values())
+    tot_dur = sum(durs)
+    if tot_dur <= 0:
+        # used to fail with ZeroDivisionError / UnboundLocalError instead
+        raise ValueError("no positive image lengths found; cannot pick a range")
+
+    def first_covered(ordered):
+        # first length whose cumulative share of tot_dur exceeds coverage_factor%
+        acc = 0
+        for d in ordered:
+            acc += d
+            if acc * 100.0 / tot_dur > coverage_factor:
+                return d
+        return ordered[-1]  # only reached when coverage_factor >= 100
+
+    # 30 is a hard limit to avoid too many allowed lengths --not critical
+    start_dur, end_dur = first_covered(durs), first_covered(durs[::-1])
+    return max(start_dur, 30), end_dur
+
+
+def find_allowed_durations(start_len, end_len, args):
+    """Generate allowed lengths from start_len up to about end_len: multiples
+       of args.frame_subsampling_factor, spaced by the factor args.factor.
+       Also writes them, one per line, to <args.srcdir>/allowed_lengths.txt.
+
+     Returns
+     -------
+     allowed_lengths: strictly increasing list of allowed lengths (in frames)
+    """
+    allowed_lengths = []
+    length, fsf = start_len, args.frame_subsampling_factor
+    # text mode: writing str to a 'wb' handle raises TypeError on python3
+    with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'w') as fp:
+        while length < end_len:
+            length = fsf * (length // fsf)  # round down to a multiple of fsf
+            if allowed_lengths and length <= allowed_lengths[-1]:
+                # rounding cancelled the growth; the old code looped forever
+                length = allowed_lengths[-1] + fsf
+            allowed_lengths.append(length)
+            fp.write("{}\n".format(int(length)))
+            length *= args.factor
+    return allowed_lengths
+
+
+
+def perturb_utterances(utterances, allowed_durations, args):
+    """Given a set of utterances and a set of allowed durations, generate
+       an extended set of perturbed utterances (all having an allowed duration)
+
+     Returns
+     -------
+     perturbed_utterances: list of pertubed utterances
+    """
+
+    perturbed_utterances = []
+    for u in utterances:
+        # find i such that: allowed_durations[i-1] <= u.dur <= allowed_durations[i]
+        # i = len(allowed_durations) --> no upper bound
+        # i = 0         --> no lower bound
+        if u.dur < allowed_durations[0]:
+            i = 0
+        elif u.dur > allowed_durations[-1]:
+            i = len(allowed_durations)
+        else:
+            i = 1
+            while i < len(allowed_durations):
+                if u.dur <= allowed_durations[i] and u.dur >= allowed_durations[i - 1]:
+                    break
+                i += 1
+
+        if i > 0 and args.speed_perturb:  # we have a smaller allowed duration
+            allowed_dur = allowed_durations[i - 1]
+            speed = u.dur / allowed_dur
+            if max(speed, 1.0/speed) > args.factor:  # this could happen for very short/long utterances
+                continue
+            u1 = copy.deepcopy(u)
+            u1.id = 'pv1-' + u.id
+            u1.speaker = 'pv1-' + u.speaker
+            u1.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed)
+            u1.dur = allowed_dur
+            perturbed_utterances.append(u1)
+
+
+        if i < len(allowed_durations):  # we have a larger allowed duration
+            allowed_dur2 = allowed_durations[i]
+            speed = u.dur / allowed_dur2
+            if max(speed, 1.0/speed) > args.factor:
+                continue
+
+            ## Add two versions for the second allowed_duration
+            ## one version is by using speed modification using sox
+            ## the other is by extending by silence
+            if args.speed_perturb:
+                u2 = copy.deepcopy(u)
+                u2.id = 'pv2-' + u.id
+                u2.speaker = 'pv2-' + u.speaker
+                u2.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed)
+                u2.dur = allowed_dur2
+                perturbed_utterances.append(u2)
+
+            delta = allowed_dur2 - u.dur
+            if delta <= 1e-4:
+                continue
+            u3 = copy.deepcopy(u)
+            u3.id = 'pv3-' + u.id
+            u3.speaker = 'pv3-' + u.speaker
+            u3.wavefile = '{} extend-wav-with-silence --extra-silence-length={} - - | '.format(u.wavefile, delta)
+            u3.dur = allowed_dur2
+            perturbed_utterances.append(u3)
+    return perturbed_utterances
+
+
+
+def main():
+    args = get_args()
+    args.factor = 1.0 + args.factor / 100.0
+    # factor was given in percent; from here on it is a multiplier (10 -> 1.1)
+    image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames.txt'))
+    start_dur, end_dur = find_duration_range(image2length, args.coverage_factor)
+    logger.info("Lengths in the range [{},{}] will be covered. "
+                "Coverage rate: {}%".format(start_dur, end_dur,
+                                      100.0 - args.coverage_factor * 2))
+    # float(): under python2 int / int truncates, which made this estimate too
+    # small and raised ValueError (log of 0) whenever end_dur < start_dur
+    logger.info("There will be {} unique allowed lengths "
+                "for the images.".format(int(math.log(float(end_dur) / start_dur) /
+                                             math.log(args.factor))))
+    find_allowed_durations(start_dur, end_dur, args)
+
+
+if __name__ == '__main__':
+      main()
diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py
new file mode 100755
index 00000000000..eb4d0120658
--- /dev/null
+++ b/egs/cifar/v1/image/get_image2num_frames.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+
+# Copyright      2017  Chun Chieh Chang
+#                2017  Ashish Arora
+
+""" This script computes the number of frames (i.e. the width in pixels) that
+    each image will have after it is scaled to a fixed height. The input to
+    this script is the path to a data directory, e.g. "data/train". It reads
+    the images listed in images.scp and writes one "<image-id> <num-frames>"
+    line per image to image2num_frames.txt in that directory (or --out-ark).
+    num-frames = width after scaling the height to --feat-dim + 2 * --padding.
+
+    eg. image/get_image2num_frames.py data/train --feat-dim 40
+"""
+
+import argparse
+import os
+import sys
+import numpy as np
+from scipy import misc
+
+parser = argparse.ArgumentParser(description="""Computes the scaled width (number of frames) of each image
+                                                in 'dir'/images.scp and writes "<image-id> <num-frames>" lines.""")
+parser.add_argument('dir', type=str,
+                    help='Source data directory (containing images.scp)')
+parser.add_argument('--out-ark', type=str, default=None,
+                    help="Output file (default: 'dir'/image2num_frames.txt; '-' means stdout).")
+parser.add_argument('--feat-dim', type=int, default=40,
+                    help='Size to scale the height of all images')
+parser.add_argument('--padding', type=int, default=5,
+                    help='Number of white pixels to pad on the left '
+                    'and right side of the image.')
+args = parser.parse_args()
+
+
+def get_scaled_image_length(im):
+    """Return the width (in pixels) of 'im' after scaling its height to
+       args.feat_dim with the aspect ratio kept; the result is truncated.
+    """
+    height, width = im.shape[0], im.shape[1]
+    ratio = (1.0 * args.feat_dim) / height
+    return int(ratio * width)
+
+### main ###
+# For every "<image-id> <image-path>" line of 'dir'/images.scp, write one
+# "<image-id> <num-frames>" line to --out-ark ('-' means standard output).
+data_list_path = os.path.join(args.dir,'images.scp')
+if not args.out_ark:
+    args.out_ark = os.path.join(args.dir,'image2num_frames.txt')
+to_stdout = args.out_ark == '-'
+out_fh = sys.stdout if to_stdout else open(args.out_ark, 'w', encoding='latin-1')
+try:
+    with open(data_list_path) as f:
+        for line in f:
+            fields = line.strip().split(' ')
+            if fields == ['']:
+                continue  # blank line; it used to crash with IndexError
+            image_id, image_path = fields[0], fields[1]
+            im_len = get_scaled_image_length(misc.imread(image_path)) + (args.padding * 2)
+            print('{} {}'.format(image_id, im_len), file=out_fh)
+finally:
+    # close the output even if an image fails to load, but never sys.stdout
+    if not to_stdout:
+        out_fh.close()
diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh
new file mode 100755
index 00000000000..2278ab99235
--- /dev/null
+++ b/egs/iam/v1/run_end2end.sh
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright 2017    Hossein Hadian
+
+set -e
+stage=0
+nj=20
+username=
+password=
+# iam_database points to the database path on the JHU grid. If you have not
+# already downloaded the database you can set it to a local directory
+# like "data/download" and follow the instructions
+# in "local/prepare_data.sh" to download the database:
+iam_database=/export/corpora5/handwriting_ocr/IAM
+
+. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
+           ## This relates to the queue.
+. ./path.sh
+. ./utils/parse_options.sh  # e.g. this parses the above options
+                            # if supplied.
+
+
+./local/check_tools.sh
+
+if [ $stage -le 0 ]; then
+  echo "$0: Preparing data..."
+  local/prepare_data.sh --download-dir "$iam_database" \
+    --username "$username" --password "$password"
+fi
+mkdir -p data/{train,test}/data
+
+if [ $stage -le 1 ]; then
+  get_image2num_frames.py data/train  # This will be needed for the next command
+  # The next command creates a "allowed_lengths.txt" file in data/train
+  # which will be used by local/make_features.py to enforce the images to
+  # have allowed lengths. The allowed lengths will be spaced by 10% difference in length.
+  image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train
+  echo "$0: Preparing the test and train feature files..."
+  for dataset in train test; do
+    local/make_features.py data/$dataset --feat-dim 40 | \
+      copy-feats --compress=true --compression-method=7 \
+                 ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp
+    steps/compute_cmvn_stats.sh data/$dataset
+  done
+  utils/fix_data_dir.sh data/train
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: Preparing dictionary and lang..."
+  local/prepare_dict.sh
+  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
+                        data/local/dict "<unk>" data/lang/temp data/lang
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: Estimating a language model for decoding..."
+  local/train_lm.sh
+  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \
+                     data/local/dict/lexicon.txt data/lang_test
+fi
+
+
+if [ $stage -le 4 ]; then
+  echo "$0: estimating phone language model for the denominator graph"
+  mkdir -p exp/chain/e2e_base/log
+  $train_cmd exp/chain/e2e_base/log/make_phone_lm.log \
+  cat data/train/text \| \
+    steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \
+    utils/sym2int.pl -f 2- data/lang/phones.txt \| \
+    chain-est-phone-lm --num-extra-lm-states=2000 \
+                       ark:- exp/chain/e2e_base/phone_lm.fst
+fi
+
+if [ $stage -le 5 ]; then
+  echo "$0: calling the flat-start chain recipe..."
+  local/chain/e2e/run_tdnn_flatstart.sh
+fi

From 92a5866f4a108efc03402de15da3b87cbc2e6a95 Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Sun, 18 Mar 2018 23:23:53 -0400
Subject: [PATCH 2/9] Add e2e chain script

---
 egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 153 ++++++++++++++++++
 egs/iam/v1/run_end2end.sh                     |   4 +-
 2 files changed, 155 insertions(+), 2 deletions(-)
 create mode 100755 egs/iam/v1/local/chain/run_flatstart_cnn1a.sh

diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
new file mode 100755
index 00000000000..3dd77a03522
--- /dev/null
+++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+# Copyright    2017  Hossein Hadian
+
+
+set -e
+
+# configs for 'chain'
+stage=0
+train_stage=-10
+get_egs_stage=-10
+affix=1a
+
+# training options
+tdnn_dim=450
+num_epochs=4
+num_jobs_initial=2
+num_jobs_final=4
+minibatch_size=150=128,64/300=100,64,32/600=50,32,16/1200=16,8
+common_egs_dir=
+l2_regularize=0.00005
+frames_per_iter=1500000
+cmvn_opts="--norm-means=true --norm-vars=true"
+train_set=train_e2e
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+lang=data/lang_e2e
+treedir=exp/chain/e2e_bitree  # it's actually just a trivial tree (no tree building)
+dir=exp/chain/e2e_cnn_${affix}
+
+if [ $stage -le 0 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r data/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 1 ]; then
+  steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \
+                                       --shared-phones true \
+                                       --type biphone \
+                                       data/$train_set $lang $treedir
+  cp exp/chain/e2e_base/phone_lm.fst $treedir/
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+  num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
+
+  opts="l2-regularize=0.075"
+  opts_2="l2-regularize=0.075"
+  opts_3="l2-regularize=0.1"
+  common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
+  common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
+  common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=40 name=input
+
+  conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
+  conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
+  conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
+  relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3
+EOF
+
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs
+fi
+
+if [ $stage -le 3 ]; then
+  # no need to store the egs in a shared storage because we always
+  # remove them. Anyway, it takes only 5 minutes to generate them.
+
+  steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \
+    --cmd "$cmd" \
+    --feat.cmvn-opts "$cmvn_opts" \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize $l2_regularize \
+    --chain.apply-deriv-weights false \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter $frames_per_iter \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.momentum 0 \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.optimization.shrink-value 1.0 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs true \
+    --feat-dir data/${train_set} \
+    --tree-dir $treedir \
+    --dir $dir  || exit 1;
+fi
+
+if [ $stage -le 4 ]; then
+  # The reason we are using data/lang here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model).  So you could give it a different
+  # lang directory, one that contained a wordlist and LM of your choice,
+  # as long as phones.txt was compatible.
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/$lang_test \
+    $dir $dir/graph || exit 1;
+fi
+
+if [ $stage -le 5 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --extra-left-context $chunk_left_context \
+    --extra-right-context $chunk_right_context \
+    --extra-left-context-initial 0 \
+    --extra-right-context-final 0 \
+    --frames-per-chunk $frames_per_chunk \
+    --nj $nj --cmd "$cmd" \
+    $dir/graph data/test $dir/decode_test || exit 1;
+fi
+
+echo "Done. Date: $(date). Results:"
+local/chain/compare_wer.sh $dir
diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh
index 2278ab99235..1b606d0347c 100755
--- a/egs/iam/v1/run_end2end.sh
+++ b/egs/iam/v1/run_end2end.sh
@@ -62,11 +62,11 @@ fi
 if [ $stage -le 4 ]; then
   echo "$0: estimating phone language model for the denominator graph"
   mkdir -p exp/chain/e2e_base/log
-  $train_cmd exp/chain/e2e_base/log/make_phone_lm.log \
+  $cmd exp/chain/e2e_base/log/make_phone_lm.log \
   cat data/train/text \| \
     steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \
     utils/sym2int.pl -f 2- data/lang/phones.txt \| \
-    chain-est-phone-lm --num-extra-lm-states=2000 \
+    chain-est-phone-lm --num-extra-lm-states=1000 \
                        ark:- exp/chain/e2e_base/phone_lm.fst
 fi
 

From ea839ade679703a9fe52b1b636fcfe8a1c1ec1bb Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Mon, 19 Mar 2018 10:22:46 -0400
Subject: [PATCH 3/9] Some fixes

---
 egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 16 +++----
 egs/iam/v1/local/make_features.py             | 43 ++++++++++++++++---
 2 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
index 3dd77a03522..d7c39c78f21 100755
--- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
+++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
@@ -8,19 +8,20 @@ set -e
 stage=0
 train_stage=-10
 get_egs_stage=-10
-affix=1a
+affix=1afix1
 
 # training options
 tdnn_dim=450
 num_epochs=4
 num_jobs_initial=2
 num_jobs_final=4
-minibatch_size=150=128,64/300=100,64,32/600=50,32,16/1200=16,8
+minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8
 common_egs_dir=
 l2_regularize=0.00005
-frames_per_iter=1500000
+frames_per_iter=1000000
 cmvn_opts="--norm-means=true --norm-vars=true"
 train_set=train_e2e
+lang_test=lang_test
 
 # End configuration section.
 echo "$0 $@"  # Print the command line for logging
@@ -108,6 +109,8 @@ if [ $stage -le 3 ]; then
     --egs.dir "$common_egs_dir" \
     --egs.stage $get_egs_stage \
     --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
+    --chain.frame-subsampling-factor 4 \
+    --chain.alignment-subsampling-factor 4 \
     --trainer.num-chunk-per-minibatch $minibatch_size \
     --trainer.frames-per-iter $frames_per_iter \
     --trainer.num-epochs $num_epochs \
@@ -140,12 +143,7 @@ fi
 if [ $stage -le 5 ]; then
   frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
   steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
-    --extra-left-context $chunk_left_context \
-    --extra-right-context $chunk_right_context \
-    --extra-left-context-initial 0 \
-    --extra-right-context-final 0 \
-    --frames-per-chunk $frames_per_chunk \
-    --nj $nj --cmd "$cmd" \
+    --nj 30 --cmd "$cmd" \
     $dir/graph data/test $dir/decode_test || exit 1;
 fi
 
diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py
index b998464953f..08a6f96b940 100755
--- a/egs/iam/v1/local/make_features.py
+++ b/egs/iam/v1/local/make_features.py
@@ -30,6 +30,10 @@
 parser.add_argument('--padding', type=int, default=5,
                     help='Number of white pixels to pad on the left'
                     'and right side of the image.')
+#parser.add_argument('--img2len-file', type=str, default=None,
+#                    help='If supplied, each images will be padded to reach the '
+#                    'target length (this overrides --padding).')
+
 args = parser.parse_args()
 
 
@@ -49,7 +53,7 @@ def write_kaldi_matrix(file_handle, matrix, key):
             file_handle.write("\n")
     file_handle.write(" ]\n")
 
-def get_scaled_image(im):
+def get_scaled_image(im, allowed_lengths = None):
     scale_size = args.feat_dim
     sx = im.shape[1]
     sy = im.shape[0]
@@ -57,11 +61,24 @@ def get_scaled_image(im):
     nx = int(scale_size)
     ny = int(scale * sx)
     im = misc.imresize(im, (nx, ny))
-    padding_x = args.padding
-    padding_y = im.shape[0]
-    im_pad = np.concatenate((255 * np.ones((padding_y, padding_x),
+    if allowed_lengths is None:
+        left_padding = right_padding = args.padding
+    else:
+        imlen = im.shape[1]
+        allowed_len = 0
+        for l in allowed_lengths:
+            if l > imlen:
+                allowed_len = l
+                break
+        if allowed_len == 0:
+            return None
+        padding = allowed_len - imlen
+        left_padding = padding // 2
+        right_padding = padding - left_padding
+    dim_y = im.shape[0]
+    im_pad = np.concatenate((255 * np.ones((dim_y, left_padding),
                                            dtype=int), im), axis=1)
-    im_pad1 = np.concatenate((im_pad, 255 * np.ones((padding_y, padding_x),
+    im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding),
                                                     dtype=int)), axis=1)
     return im_pad1
 
@@ -73,6 +90,15 @@ def get_scaled_image(im):
 else:
     out_fh = open(args.out_ark,'wb')
 
+allowed_lengths = None
+if os.path.isfile(os.path.join(args.dir,'allowed_lengths.txt')):
+    print("Found 'allowed-lengths' file...", file=sys.stderr)
+    allowed_lengths = []
+    with open(os.path.join(args.dir,'allowed_lengths.txt')) as f:
+        for line in f:
+            allowed_lengths.append(int(line.strip()))
+    print("Read {} allowed lengths and will apply them to the features.".format(len(allowed_lengths)), file=sys.stderr)
+
 with open(data_list_path) as f:
     for line in f:
         line = line.strip()
@@ -80,8 +106,11 @@ def get_scaled_image(im):
         image_id = line_vect[0]
         image_path = line_vect[1]
         im = misc.imread(image_path)
-        im_scale = get_scaled_image(im)
+        im_scaled = get_scaled_image(im, allowed_lengths)
 
-        data = np.transpose(im_scale, (1, 0))
+        if im_scaled is None:
+            print('Image scaling failed: {} '.format(image_id), file=sys.stderr)
+            continue
+        data = np.transpose(im_scaled, (1, 0))
         data = np.divide(data, 255.0)
         write_kaldi_matrix(out_fh, data, image_id)

From f5cbb24277a2c02c49260b6c4476e8e2945826ec Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Mon, 19 Mar 2018 14:52:47 -0400
Subject: [PATCH 4/9] Some cleaning

---
 egs/cifar/v1/image/get_allowed_lengths.py | 168 ----------------------
 egs/iam/v1/run_end2end.sh                 |   4 +-
 2 files changed, 2 insertions(+), 170 deletions(-)

diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py
index 07db16e2238..668d4cb493d 100755
--- a/egs/cifar/v1/image/get_allowed_lengths.py
+++ b/egs/cifar/v1/image/get_allowed_lengths.py
@@ -49,68 +49,6 @@ def get_args():
     args = parser.parse_args()
     return args
 
-class Utterance:
-    """ This class represents a Kaldi utterance
-        in a data directory like data/train
-    """
-
-    def __init__(self, uid, wavefile, speaker, transcription, dur):
-        self.wavefile = (wavefile if wavefile.rstrip().endswith('|') else
-                         'cat {} |'.format(wavefile))
-        self.speaker = speaker
-        self.transcription = transcription
-        self.id = uid
-        self.dur = float(dur)
-
-    def to_kaldi_utt_str(self):
-        return self.id + " " + self.transcription
-
-    def to_kaldi_wave_str(self):
-        return self.id + " " + self.wavefile
-
-    def to_kaldi_dur_str(self):
-        return "{} {:0.3f}".format(self.id, self.dur)
-
-
-def read_kaldi_datadir(dir):
-    """ Read a data directory like
-        data/train as a list of utterances
-    """
-
-    # check to make sure that no segments file exists as this script won't work
-    # with data directories which use a segments file.
-    if os.path.isfile(os.path.join(dir, 'segments')):
-        logger.info("The data directory '{}' seems to use a 'segments' file. "
-                    "This script does not yet support a 'segments' file. You'll need "
-                    "to use utils/data/extract_wav_segments_data_dir.sh "
-                    "to convert the data dir so it does not use a 'segments' file. "
-                    "Exiting...".format(dir))
-        sys.exit(1)
-
-    logger.info("Loading the data from {}...".format(dir))
-    utterances = []
-    wav_scp = read_kaldi_mapfile(os.path.join(dir, 'wav.scp'))
-    text = read_kaldi_mapfile(os.path.join(dir, 'text'))
-    utt2dur = read_kaldi_mapfile(os.path.join(dir, 'utt2dur'))
-    utt2spk = read_kaldi_mapfile(os.path.join(dir, 'utt2spk'))
-
-    num_fail = 0
-    for utt in wav_scp:
-        if utt in text and utt in utt2dur and utt in utt2spk:
-            utterances.append(Utterance(utt, wav_scp[utt], utt2spk[utt],
-                                  text[utt], utt2dur[utt]))
-        else:
-            num_fail += 1
-
-    if len(utterances) / len(wav_scp) < 0.5:
-        logger.info("More than half your data is problematic. Try "
-                    "fixing using fix_data_dir.sh.")
-        sys.exit(1)
-
-    logger.info("Successfully read {} utterances. Failed for {} "
-                "utterances.".format(len(utterances), num_fail))
-    return utterances
-
 
 def read_kaldi_mapfile(path):
     """ Read any Kaldi mapping file - like text, .scp files, etc.
@@ -126,44 +64,6 @@ def read_kaldi_mapfile(path):
             m[key] = val
     return m
 
-def generate_kaldi_data_files(utterances, outdir):
-    """ Write out a list of utterances as Kaldi data files into an
-        output data directory.
-    """
-
-    logger.info("Exporting to {}...".format(outdir))
-    speakers = {}
-
-    with open(os.path.join(outdir, 'text'), 'w') as f:
-        for utt in utterances:
-            f.write(utt.to_kaldi_utt_str() + "\n")
-
-    with open(os.path.join(outdir, 'wav.scp'), 'w') as f:
-        for utt in utterances:
-            f.write(utt.to_kaldi_wave_str() + "\n")
-
-    with open(os.path.join(outdir, 'utt2dur'), 'w') as f:
-        for utt in utterances:
-            f.write(utt.to_kaldi_dur_str() + "\n")
-
-    with open(os.path.join(outdir, 'utt2spk'), 'w') as f:
-        for utt in utterances:
-            f.write(utt.id + " " + utt.speaker + "\n")
-            if utt.speaker not in speakers:
-                speakers[utt.speaker] = [utt.id]
-            else:
-                speakers[utt.speaker].append(utt.id)
-
-    with open(os.path.join(outdir, 'spk2utt'), 'w') as f:
-        for s in speakers:
-            f.write(s + " ")
-            for utt in speakers[s]:
-                f.write(utt + " ")
-            f.write('\n')
-
-    logger.info("Successfully wrote {} utterances to data "
-                "directory '{}'".format(len(utterances), outdir))
-
 def find_duration_range(img2len, coverage_factor):
     """Given a list of utterances, find the start and end duration to cover
 
@@ -223,74 +123,6 @@ def find_allowed_durations(start_len, end_len, args):
 
 
 
-def perturb_utterances(utterances, allowed_durations, args):
-    """Given a set of utterances and a set of allowed durations, generate
-       an extended set of perturbed utterances (all having an allowed duration)
-
-     Returns
-     -------
-     perturbed_utterances: list of pertubed utterances
-    """
-
-    perturbed_utterances = []
-    for u in utterances:
-        # find i such that: allowed_durations[i-1] <= u.dur <= allowed_durations[i]
-        # i = len(allowed_durations) --> no upper bound
-        # i = 0         --> no lower bound
-        if u.dur < allowed_durations[0]:
-            i = 0
-        elif u.dur > allowed_durations[-1]:
-            i = len(allowed_durations)
-        else:
-            i = 1
-            while i < len(allowed_durations):
-                if u.dur <= allowed_durations[i] and u.dur >= allowed_durations[i - 1]:
-                    break
-                i += 1
-
-        if i > 0 and args.speed_perturb:  # we have a smaller allowed duration
-            allowed_dur = allowed_durations[i - 1]
-            speed = u.dur / allowed_dur
-            if max(speed, 1.0/speed) > args.factor:  # this could happen for very short/long utterances
-                continue
-            u1 = copy.deepcopy(u)
-            u1.id = 'pv1-' + u.id
-            u1.speaker = 'pv1-' + u.speaker
-            u1.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed)
-            u1.dur = allowed_dur
-            perturbed_utterances.append(u1)
-
-
-        if i < len(allowed_durations):  # we have a larger allowed duration
-            allowed_dur2 = allowed_durations[i]
-            speed = u.dur / allowed_dur2
-            if max(speed, 1.0/speed) > args.factor:
-                continue
-
-            ## Add two versions for the second allowed_duration
-            ## one version is by using speed modification using sox
-            ## the other is by extending by silence
-            if args.speed_perturb:
-                u2 = copy.deepcopy(u)
-                u2.id = 'pv2-' + u.id
-                u2.speaker = 'pv2-' + u.speaker
-                u2.wavefile = '{} sox -t wav - -t wav - speed {} | '.format(u.wavefile, speed)
-                u2.dur = allowed_dur2
-                perturbed_utterances.append(u2)
-
-            delta = allowed_dur2 - u.dur
-            if delta <= 1e-4:
-                continue
-            u3 = copy.deepcopy(u)
-            u3.id = 'pv3-' + u.id
-            u3.speaker = 'pv3-' + u.speaker
-            u3.wavefile = '{} extend-wav-with-silence --extra-silence-length={} - - | '.format(u.wavefile, delta)
-            u3.dur = allowed_dur2
-            perturbed_utterances.append(u3)
-    return perturbed_utterances
-
-
-
 def main():
     args = get_args()
     args.factor = 1.0 + args.factor / 100.0
diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh
index 1b606d0347c..d479bfa2a73 100755
--- a/egs/iam/v1/run_end2end.sh
+++ b/egs/iam/v1/run_end2end.sh
@@ -47,7 +47,7 @@ fi
 if [ $stage -le 2 ]; then
   echo "$0: Preparing dictionary and lang..."
   local/prepare_dict.sh
-  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
+  utils/prepare_lang.sh --sil-prob 0.95 \
                         data/local/dict "<unk>" data/lang/temp data/lang
 fi
 
@@ -72,5 +72,5 @@ fi
 
 if [ $stage -le 5 ]; then
   echo "$0: calling the flat-start chain recipe..."
-  local/chain/e2e/run_tdnn_flatstart.sh
+  local/chain/run_flatstart_cnn1a.sh
 fi

From 871625b3f731be47bf920658bbe412137b9ef391 Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Mon, 26 Mar 2018 15:12:34 -0400
Subject: [PATCH 5/9] Fix the docs.

---
 egs/cifar/v1/image/get_allowed_lengths.py  | 17 ++++++++---------
 egs/cifar/v1/image/get_image2num_frames.py | 22 ++++++++++------------
 2 files changed, 18 insertions(+), 21 deletions(-)

diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py
index 668d4cb493d..2e7996c4d77 100755
--- a/egs/cifar/v1/image/get_allowed_lengths.py
+++ b/egs/cifar/v1/image/get_allowed_lengths.py
@@ -4,8 +4,11 @@
 # Apache 2.0
 
 
-""" This script perturbs speeds of utterances to force their lengths to some
-    allowed lengths spaced by a factor (like 10%)
+""" This script finds a set of allowed lengths for a given OCR/HWR data dir.
+    The allowed lengths are spaced by a factor (like 10%) and are written
+    in an output file named "allowed_lengths.txt" in the output data dir. This
+    file is later used by make_features.py to pad each image sufficiently so that
+    they all have an allowed length. This is intended for end2end chain training.
 """
 
 import argparse
@@ -28,13 +31,9 @@
 logger.addHandler(handler)
 
 def get_args():
-    parser = argparse.ArgumentParser(description="""This script copies the 'srcdir'
-                                   data directory to output data directory 'dir'
-                                   while modifying the utterances so that there are
-                                   3 copies of each utterance: one with the same
-                                   speed, one with a higher speed (not more than
-                                   factor% faster) and one with a lower speed
-                                   (not more than factor% slower)""")
+    parser = argparse.ArgumentParser(description="""This script finds a set of
+                                   allowed lengths for a given OCR/HWR data dir.
+                                   Intended for chain training."""
     parser.add_argument('factor', type=float, default=12,
                         help='Spacing (in percentage) between allowed lengths.')
     parser.add_argument('srcdir', type=str,
diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py
index eb4d0120658..91a7e34dae5 100755
--- a/egs/cifar/v1/image/get_image2num_frames.py
+++ b/egs/cifar/v1/image/get_image2num_frames.py
@@ -1,16 +1,13 @@
 #!/usr/bin/env python3
 
-# Copyright      2017  Chun Chieh Chang
-#                2017  Ashish Arora
+# Copyright      2018  Hossein Hadian
 
-""" This script converts images to Kaldi-format feature matrices. The input to
-    this script is the path to a data directory, e.g. "data/train". This script
-    reads the images listed in images.scp and writes them to standard output
-    (by default) as Kaldi-formatted matrices (in text form). It also scales the
-    images so they have the same height (via --feat-dim). It can optionally pad
-    the images (on left/right sides) with white pixels.
 
-    eg. local/make_features.py data/train --feat-dim 40
+""" This script computes the image lengths (with padding) in an image data dir.
+    The output is written to 'image2num_frames.txt' in the given data dir. This
+    file is later used by image/get_allowed_lengths.py to find a set of allowed lengths
+    for the data dir. The output format is similar to utt2num_frames
+
 """
 
 import argparse
@@ -19,12 +16,13 @@
 import numpy as np
 from scipy import misc
 
-parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and
-                                                writes them to standard output in text format.""")
+parser = argparse.ArgumentParser(description="""Computes the image lengths (with padding) in an image data dir
+                                                and writes them (by default) to image2num_frames.txt.""")
 parser.add_argument('dir', type=str,
                     help='Source data directory (containing images.scp)')
 parser.add_argument('--out-ark', type=str, default=None,
-                    help='Where to write the output image-to-num_frames info.')
+                    help='Where to write the output image-to-num_frames info. '
+                    'Default: dir/image2num_frames.txt')
 parser.add_argument('--feat-dim', type=int, default=40,
                     help='Size to scale the height of all images')
 parser.add_argument('--padding', type=int, default=5,

From 95958ce8ab53222df0c58441be3770a3775b5cca Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Mon, 26 Mar 2018 15:16:22 -0400
Subject: [PATCH 6/9] Add more docs

---
 egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
index d7c39c78f21..1f2d1755fe2 100755
--- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
+++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 # Copyright    2017  Hossein Hadian
 
+# This script does end2end chain training (i.e. from scratch)
 
 set -e
 
@@ -8,7 +9,7 @@ set -e
 stage=0
 train_stage=-10
 get_egs_stage=-10
-affix=1afix1
+affix=1a
 
 # training options
 tdnn_dim=450

From f781feed8f087e03efc99dad40f4936b23b4f83e Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Mon, 26 Mar 2018 19:05:59 -0400
Subject: [PATCH 7/9] Some more cleaning + results

---
 egs/cifar/v1/image/get_image2num_frames.py    |  2 +-
 egs/iam/v1/local/chain/run_flatstart_cnn1a.sh | 13 +++++++++
 egs/iam/v1/local/make_features.py             | 29 +++++++++++++------
 3 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py
index 91a7e34dae5..5ebd5d15a9f 100755
--- a/egs/cifar/v1/image/get_image2num_frames.py
+++ b/egs/cifar/v1/image/get_image2num_frames.py
@@ -16,7 +16,7 @@
 import numpy as np
 from scipy import misc
 
-parser = argparse.ArgumentParser(description="""Computes the image lengths (with padding) in an image data dir
+parser = argparse.ArgumentParser(description="""Computes the image lengths (i.e. width) in an image data dir
                                                 and writes them (by default) to image2num_frames.txt.""")
 parser.add_argument('dir', type=str,
                     help='Source data directory (containing images.scp)')
diff --git a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
index 1f2d1755fe2..65eeedcc75b 100755
--- a/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
+++ b/egs/iam/v1/local/chain/run_flatstart_cnn1a.sh
@@ -3,6 +3,19 @@
 
 # This script does end2end chain training (i.e. from scratch)
 
+# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a
+# System                         cnn_1a cnn_chainali_1c e2e_cnn_1a
+# WER                             18.58     12.84     15.46
+# CER                             10.17      6.40      7.21
+# Final train prob              -0.0122   -0.0120   -0.0426
+# Final valid prob              -0.0999   -0.0199   -0.0724
+# Final train prob (xent)       -0.5652   -0.9973
+# Final valid prob (xent)       -0.9758   -1.1537
+# Parameters                      4.36M     3.96M     9.13M
+
+# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/
+# exp/chain/e2e_cnn_1a/: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.040->-0.040 (over 1) logprob:train/valid[13,20,final]=(-0.065,-0.046,-0.043/-0.081,-0.073,-0.072)
+
 set -e
 
 # configs for 'chain'
diff --git a/egs/iam/v1/local/make_features.py b/egs/iam/v1/local/make_features.py
index 08a6f96b940..8cfca5ee830 100755
--- a/egs/iam/v1/local/make_features.py
+++ b/egs/iam/v1/local/make_features.py
@@ -2,6 +2,7 @@
 
 # Copyright      2017  Chun Chieh Chang
 #                2017  Ashish Arora
+#                2018  Hossein Hadian
 
 """ This script converts images to Kaldi-format feature matrices. The input to
     this script is the path to a data directory, e.g. "data/train". This script
@@ -9,6 +10,10 @@
     (by default) as Kaldi-formatted matrices (in text form). It also scales the
     images so they have the same height (via --feat-dim). It can optionally pad
     the images (on left/right sides) with white pixels.
+    If an 'allowed_lengths.txt' file is found in the data dir, it will be used
+    to force each image to have one of the allowed lengths listed in that file
+    by padding white pixels (the --padding option will be ignored in this case).
+    This relates to end2end chain training.
 
     eg. local/make_features.py data/train --feat-dim 40
 """
@@ -30,9 +35,7 @@
 parser.add_argument('--padding', type=int, default=5,
                     help='Number of white pixels to pad on the left'
                     'and right side of the image.')
-#parser.add_argument('--img2len-file', type=str, default=None,
-#                    help='If supplied, each images will be padded to reach the '
-#                    'target length (this overrides --padding).')
+
 
 args = parser.parse_args()
 
@@ -63,7 +66,7 @@ def get_scaled_image(im, allowed_lengths = None):
     im = misc.imresize(im, (nx, ny))
     if allowed_lengths is None:
         left_padding = right_padding = args.padding
-    else:
+    else:  # Find an allowed length for the image
         imlen = im.shape[1]
         allowed_len = 0
         for l in allowed_lengths:
@@ -71,6 +74,7 @@ def get_scaled_image(im, allowed_lengths = None):
                 allowed_len = l
                 break
         if allowed_len == 0:
+            #  No allowed length was found for the image (the image is too long)
             return None
         padding = allowed_len - imlen
         left_padding = padding // 2
@@ -83,7 +87,7 @@ def get_scaled_image(im, allowed_lengths = None):
     return im_pad1
 
 ### main ###
-data_list_path = os.path.join(args.dir,'images.scp')
+data_list_path = os.path.join(args.dir, 'images.scp')
 
 if args.out_ark == '-':
     out_fh = sys.stdout
@@ -91,14 +95,17 @@ def get_scaled_image(im, allowed_lengths = None):
     out_fh = open(args.out_ark,'wb')
 
 allowed_lengths = None
-if os.path.isfile(os.path.join(args.dir,'allowed_lengths.txt')):
-    print("Found 'allowed-lengths' file...", file=sys.stderr)
+if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')):
+    print("Found 'allowed_lengths.txt' file...", file=sys.stderr)
     allowed_lengths = []
     with open(os.path.join(args.dir,'allowed_lengths.txt')) as f:
         for line in f:
             allowed_lengths.append(int(line.strip()))
-    print("Read {} allowed lengths and will apply them to the features.".format(len(allowed_lengths)), file=sys.stderr)
+    print("Read {} allowed lengths and will apply them to the "
+          "features.".format(len(allowed_lengths)), file=sys.stderr)
 
+num_fail = 0
+num_ok = 0
 with open(data_list_path) as f:
     for line in f:
         line = line.strip()
@@ -109,8 +116,12 @@ def get_scaled_image(im, allowed_lengths = None):
         im_scaled = get_scaled_image(im, allowed_lengths)
 
         if im_scaled is None:
-            print('Image scaling failed: {} '.format(image_id), file=sys.stderr)
+            num_fail += 1
             continue
         data = np.transpose(im_scaled, (1, 0))
         data = np.divide(data, 255.0)
+        num_ok += 1
         write_kaldi_matrix(out_fh, data, image_id)
+
+print('Generated features for {} images. Failed for {} (image too '
+      'long).'.format(num_ok, num_fail), file=sys.stderr)

From 861ecbaa1abf38cfe3ce2e72cc06f9b9f6df1f63 Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Tue, 27 Mar 2018 16:18:23 -0400
Subject: [PATCH 8/9] Add new chain recipe with l2-regularize (by @aarora8)

---
 egs/iam/v1/local/chain/run_cnn_chainali_1c.sh | 246 ++++++++++++++++++
 1 file changed, 246 insertions(+)
 create mode 100755 egs/iam/v1/local/chain/run_cnn_chainali_1c.sh

diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh
new file mode 100755
index 00000000000..6ff76490303
--- /dev/null
+++ b/egs/iam/v1/local/chain/run_cnn_chainali_1c.sh
@@ -0,0 +1,246 @@
+#!/bin/bash
+
+# chainali_1c is as chainali_1b except it uses l2-regularize
+# local/chain/compare_wer.sh exp/chain/cnn_chainali_1b exp/chain/cnn_chainali_1c
+# System                      cnn_chainali_1b cnn_chainali_1c
+# WER                             14.67     12.84
+# CER                              7.31      6.40
+# Final train prob               0.0042   -0.0120
+# Final valid prob              -0.0256   -0.0199
+# Final train prob (xent)       -0.6282   -0.9973
+# Final valid prob (xent)       -0.9096   -1.1537
+# Parameters                      3.96M     3.96M
+
+# steps/info/chain_dir_info.pl exp/chain/cnn_chainali_1c
+# exp/chain/cnn_chainali_1c: num-iters=21 nj=2..4 num-params=4.0M dim=40->369 combine=-0.007->-0.007 (over 1) xent:train/valid[13,20,final]=(-1.44,-1.05,-0.997/-1.53,-1.19,-1.15) logprob:train/valid[13,20,final]=(-0.056,-0.020,-0.012/-0.056,-0.025,-0.020)
+
+set -e -o pipefail
+
+stage=0
+
+nj=30
+train_set=train
+gmm=tri3        # this is the source gmm-dir that we'll use for alignments; it
+                # should have alignments for the specified training data.
+nnet3_affix=    # affix for exp dirs, e.g. it was _cleaned in tedlium.
+affix=_1c  #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
+ali=tri3_ali
+chain_model_dir=exp/chain${nnet3_affix}/cnn${affix}
+common_egs_dir=
+reporting_email=
+
+# chain options
+train_stage=-10
+xent_regularize=0.1
+frame_subsampling_factor=4
+alignment_subsampling_factor=1
+# training chunk-options
+chunk_width=340,300,200,100
+num_leaves=500
+# we don't need extra left/right context for TDNN systems.
+chunk_left_context=0
+chunk_right_context=0
+tdnn_dim=450
+# training options
+srand=0
+remove_egs=false
+lang_test=lang_unk
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+gmm_dir=exp/${gmm}
+ali_dir=exp/${ali}
+lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_lats_chain
+gmm_lat_dir=exp/chain${nnet3_affix}/${gmm}_${train_set}_lats
+dir=exp/chain${nnet3_affix}/cnn_chainali${affix}
+train_data_dir=data/${train_set}
+tree_dir=exp/chain${nnet3_affix}/tree_chain
+
+# the 'lang' directory is created by this script.
+# If you create such a directory with a non-standard topology
+# you should probably name it differently.
+lang=data/lang_chain
+for f in $train_data_dir/feats.scp \
+    $train_data_dir/feats.scp $gmm_dir/final.mdl \
+    $ali_dir/ali.1.gz $gmm_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+
+if [ $stage -le 1 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt data/$lang_test/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r data/$lang_test $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
+                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            ${train_data_dir} data/$lang_test $chain_model_dir $lat_dir
+  cp $gmm_lat_dir/splice_opts $lat_dir/splice_opts
+fi
+
+if [ $stage -le 3 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+   if [ -f $tree_dir/final.mdl ]; then
+     echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+     exit 1;
+  fi
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor $frame_subsampling_factor \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$cmd" $num_leaves ${train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 4 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+
+  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)
+  opts="l2-regularize=0.075"
+  opts_2="l2-regularize=0.075"
+  opts_3="l2-regularize=0.1"
+  common1="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36"
+  common2="$opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70"
+  common3="$opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70"
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=40 name=input
+
+  conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
+  conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
+  conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3
+  relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $opts_2
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $opts_2
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $opts_2
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $opts_2
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $opts_3
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $opts_2
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $opts_3
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 5 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$cmd" \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.00005 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--num-extra-lm-states=500" \
+    --chain.frame-subsampling-factor=$frame_subsampling_factor \
+    --chain.alignment-subsampling-factor=$alignment_subsampling_factor \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=4 \
+    --trainer.frames-per-iter=1000000 \
+    --trainer.optimization.num-jobs-initial=2 \
+    --trainer.optimization.num-jobs-final=4 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.shrink-value=1.0 \
+    --trainer.num-chunk-per-minibatch=64,32 \
+    --trainer.optimization.momentum=0.0 \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=$chunk_left_context \
+    --egs.chunk-right-context=$chunk_right_context \
+    --egs.chunk-left-context-initial=0 \
+    --egs.chunk-right-context-final=0 \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 6 ]; then
+  # The reason we are using data/lang here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model).  So you could give it a different
+  # lang directory, one that contained a wordlist and LM of your choice,
+  # as long as phones.txt was compatible.
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 data/$lang_test \
+    $dir $dir/graph || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --extra-left-context $chunk_left_context \
+    --extra-right-context $chunk_right_context \
+    --extra-left-context-initial 0 \
+    --extra-right-context-final 0 \
+    --frames-per-chunk $frames_per_chunk \
+    --nj $nj --cmd "$cmd" \
+    $dir/graph data/test $dir/decode_test || exit 1;
+fi

From 050bc1c25cc5e9b997b9440dfb157244412103db Mon Sep 17 00:00:00 2001
From: Hossein Hadian <hn.hadian@gmail.com>
Date: Tue, 27 Mar 2018 16:44:27 -0400
Subject: [PATCH 9/9] Update to python3 + rename image2num_frames.txt to
 image2num_frames (to be consistent with utt2num_frames)

---
 egs/cifar/v1/image/get_allowed_lengths.py  | 10 +++++-----
 egs/cifar/v1/image/get_image2num_frames.py |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py
index 2e7996c4d77..02321fdd2df 100755
--- a/egs/cifar/v1/image/get_allowed_lengths.py
+++ b/egs/cifar/v1/image/get_allowed_lengths.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 # Copyright     2017  Hossein Hadian
 # Apache 2.0
@@ -33,7 +33,7 @@
 def get_args():
     parser = argparse.ArgumentParser(description="""This script finds a set of
                                    allowed lengths for a given OCR/HWR data dir.
-                                   Intended for chain training."""
+                                   Intended for chain training.""")
     parser.add_argument('factor', type=float, default=12,
                         help='Spacing (in percentage) between allowed lengths.')
     parser.add_argument('srcdir', type=str,
@@ -54,7 +54,7 @@ def read_kaldi_mapfile(path):
     """
 
     m = {}
-    with open(path, 'r') as f:
+    with open(path, 'r', encoding='latin-1') as f:
         for line in f:
             line = line.strip()
             sp_pos = line.find(' ')
@@ -110,7 +110,7 @@ def find_allowed_durations(start_len, end_len, args):
 
     allowed_lengths = []
     length = start_len
-    with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'wb') as fp:
+    with open(os.path.join(args.srcdir, 'allowed_lengths.txt'), 'w', encoding='latin-1') as fp:
         while length < end_len:
             if length % args.frame_subsampling_factor != 0:
                 length = (args.frame_subsampling_factor *
@@ -126,7 +126,7 @@ def main():
     args = get_args()
     args.factor = 1.0 + args.factor / 100.0
 
-    image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames.txt'))
+    image2length = read_kaldi_mapfile(os.path.join(args.srcdir, 'image2num_frames'))
 
     start_dur, end_dur = find_duration_range(image2length, args.coverage_factor)
     logger.info("Lengths in the range [{},{}] will be covered. "
diff --git a/egs/cifar/v1/image/get_image2num_frames.py b/egs/cifar/v1/image/get_image2num_frames.py
index 5ebd5d15a9f..3c003bb9947 100755
--- a/egs/cifar/v1/image/get_image2num_frames.py
+++ b/egs/cifar/v1/image/get_image2num_frames.py
@@ -4,7 +4,7 @@
 
 
 """ This script computes the image lengths (with padding) in an image data dir.
-    The output is written to 'image2num_frames.txt' in the given data dir. This
+    The output is written to 'image2num_frames' in the given data dir. This
     file is later used by image/get_allowed_lengths.py to find a set of allowed lengths
     for the data dir. The output format is similar to utt2num_frames
 
@@ -17,12 +17,12 @@
 from scipy import misc
 
 parser = argparse.ArgumentParser(description="""Computes the image lengths (i.e. width) in an image data dir
-                                                and writes them (by default) to image2num_frames.txt.""")
+                                                and writes them (by default) to image2num_frames.""")
 parser.add_argument('dir', type=str,
                     help='Source data directory (containing images.scp)')
 parser.add_argument('--out-ark', type=str, default=None,
                     help='Where to write the output image-to-num_frames info. '
-                    'Default: dir/image2num_frames.txt')
+                    'Default: "dir"/image2num_frames')
 parser.add_argument('--feat-dim', type=int, default=40,
                     help='Size to scale the height of all images')
 parser.add_argument('--padding', type=int, default=5,
@@ -43,7 +43,7 @@ def get_scaled_image_length(im):
 data_list_path = os.path.join(args.dir,'images.scp')
 
 if not args.out_ark:
-    args.out_ark = os.path.join(args.dir,'image2num_frames.txt')
+    args.out_ark = os.path.join(args.dir,'image2num_frames')
 if args.out_ark == '-':
     out_fh = sys.stdout
 else: