diff --git a/egs/cifar/v1/image/ocr/make_features.py b/egs/cifar/v1/image/ocr/make_features.py
index 7ab75498277..07f3cb12257 100755
--- a/egs/cifar/v1/image/ocr/make_features.py
+++ b/egs/cifar/v1/image/ocr/make_features.py
@@ -43,6 +43,8 @@
 parser.add_argument('--padding', type=int, default=5,
                     help='Number of white pixels to pad on the left'
                     'and right side of the image.')
+parser.add_argument('--num-channels', type=int, default=1,
+                    help='Number of color channels')
 parser.add_argument('--fliplr', type=lambda x: (str(x).lower()=='true'), default=False,
                    help="Flip the image left-right for right to left languages")
 parser.add_argument("--augment", type=lambda x: (str(x).lower()=='true'), default=False,
@@ -84,9 +86,9 @@ def horizontal_pad(im, allowed_lengths = None):
         left_padding = int(padding // 2)
         right_padding = padding - left_padding
     dim_y = im.shape[0] # height
-    im_pad = np.concatenate((255 * np.ones((dim_y, left_padding),
+    im_pad = np.concatenate((255 * np.ones((dim_y, left_padding, args.num_channels),
                                            dtype=int), im), axis=1)
-    im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding),
+    im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding, args.num_channels),
                                                     dtype=int)), axis=1)
     return im_pad1
 
@@ -150,7 +152,13 @@ def get_scaled_image_aug(im, mode='normal'):
         if im_horizontal_padded is None:
             num_fail += 1
             continue
-        data = np.transpose(im_horizontal_padded, (1, 0))
+        if args.num_channels == 1:
+            data = np.transpose(im_horizontal_padded, (1, 0))
+        elif args.num_channels == 3:
+            H = im_horizontal_padded.shape[0]
+            W = im_horizontal_padded.shape[1]
+            C = im_horizontal_padded.shape[2]
+            data = np.reshape(np.transpose(im_horizontal_padded, (1, 0, 2)), (W, H * C))
         data = np.divide(data, 255.0)
         num_ok += 1
         write_kaldi_matrix(out_fh, data, image_id)
diff --git a/egs/yomdle_fa/README.txt b/egs/yomdle_fa/README.txt
new file mode 100644
index 00000000000..984ffdb53b5
--- /dev/null
+++ b/egs/yomdle_fa/README.txt
@@ -0,0 +1,3 @@
+This directory contains example scripts for OCR on the Yomdle and Slam datasets.
+Training is done on the Yomdle dataset and testing is done on Slam.
+LM rescoring is also done with extra corpus data obtained from various newswires (e.g. Hamshahri).
diff --git a/egs/yomdle_fa/v1/cmd.sh b/egs/yomdle_fa/v1/cmd.sh
new file mode 100755
index 00000000000..3c8eb9f93a5
--- /dev/null
+++ b/egs/yomdle_fa/v1/cmd.sh
@@ -0,0 +1,13 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances of 'queue.pl' to 'run.pl' (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export cmd="queue.pl"
diff --git a/egs/yomdle_fa/v1/image b/egs/yomdle_fa/v1/image
new file mode 120000
index 00000000000..1668ee99922
--- /dev/null
+++ b/egs/yomdle_fa/v1/image
@@ -0,0 +1 @@
+../../cifar/v1/image/
\ No newline at end of file
diff --git a/egs/yomdle_fa/v1/local/augment_data.sh b/egs/yomdle_fa/v1/local/augment_data.sh
new file mode 100755
index 00000000000..34e938db069
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/augment_data.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright   2018 Hossein Hadian
+#             2018 Ashish Arora
+
+# Apache 2.0
+# This script performs data augmentation.
+
+nj=4
+cmd=run.pl
+feat_dim=40
+fliplr=false
+echo "$0 $@"
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+
+srcdir=$1
+outdir=$2
+datadir=$3
+
+mkdir -p $datadir/augmentations
+echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp"
+
+for set in aug1; do
+  image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \
+    $srcdir $datadir/augmentations/$set
+  cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
+  local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
+    --fliplr $fliplr --augment true $datadir/augmentations/$set
+done
+
+echo " combine original data and data from different augmentations"
+utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1
+cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt
diff --git a/egs/yomdle_fa/v1/local/bidi.py b/egs/yomdle_fa/v1/local/bidi.py
new file mode 100755
index 00000000000..447313a5d02
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/bidi.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# Copyright   2018 Chun-Chieh Chang
+
+# This script is largely written by Stephen Rawls
+# and uses the python package https://pypi.org/project/PyICU_BiDi/
+# The code leaves right to left text alone and reverses left to right text.
+
+import icu_bidi
+import io
+import sys
+import unicodedata
+# Characters whose bidi class is R (strong right-to-left) or AL (strong Arabic right-to-left).
+rtl_set =  set(chr(i) for i in range(sys.maxunicode)
+               if unicodedata.bidirectional(chr(i)) in ['R','AL'])
+def determine_text_direction(text):
+    # Easy case first
+    for char in text:
+        if char in rtl_set:
+            return icu_bidi.UBiDiLevel.UBIDI_RTL
+    # If we made it here we did not encounter any strongly rtl char
+    return icu_bidi.UBiDiLevel.UBIDI_LTR
+
+def utf8_visual_to_logical(text):  # visual order -> logical order; not called by the main loop below
+    text_dir = determine_text_direction(text)
+    # Configure ICU's inverse BiDi mode (the input is taken to be in display order).
+    bidi = icu_bidi.Bidi()
+    bidi.inverse = True
+    bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_INVERSE_LIKE_DIRECT
+    bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT # icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS
+    # Treat the whole string as one paragraph; None presumably means no explicit embedding levels (confirm).
+    bidi.set_para(text, text_dir, None)
+    # Options: mirror paired characters in RTL runs and keep combining marks after their base character.
+    res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
+
+    return res
+
+def utf8_logical_to_visual(text):  # logical order -> visual (display) order
+    text_dir = determine_text_direction(text)
+    # Base direction: RTL if the text holds any character of rtl_set, otherwise LTR.
+    bidi = icu_bidi.Bidi()
+    # Default (forward) reordering; the inverse flag is deliberately left untouched here.
+    bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_DEFAULT
+    bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT  #icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS
+    # Treat the whole string as one paragraph; None presumably means no explicit embedding levels (confirm).
+    bidi.set_para(text, text_dir, None)
+    # Options: mirror paired characters in RTL runs and keep combining marks after their base character.
+    res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
+
+    return res
+
+
+##main##
+sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8")
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
+for line in sys.stdin:
+    line = line.strip()
+    line = utf8_logical_to_visual(line)[::-1]
+    sys.stdout.write(line + '\n')
diff --git a/egs/yomdle_fa/v1/local/chain/compare_wer.sh b/egs/yomdle_fa/v1/local/chain/compare_wer.sh
new file mode 100755
index 00000000000..ab880c1adb5
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/chain/compare_wer.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# this script is used for comparing decoding results between systems.
+# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
+
+# Copyright      2017  Chun Chieh Chang
+#                2017  Ashish Arora
+
+if [ $# == 0 ]; then
+  echo "Usage: $0: <dir1> [<dir2> ... ]"
+  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
+  exit 1
+fi
+
+echo "# $0 $*"
+used_epochs=false
+
+echo -n "# System                     "
+for x in $*; do   printf "% 10s" " $(basename $x)";   done
+echo
+
+echo -n "# WER                        "
+for x in $*; do
+  wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}')
+  printf "% 10s" $wer
+done
+echo
+
+echo -n "# CER                        "
+for x in $*; do
+  cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
+  printf "% 10s" $cer
+done
+echo
+
+
+if $used_epochs; then
+  exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
+fi
+
+echo -n "# Final train prob           "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob           "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final train prob (xent)    "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob (xent)    "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
diff --git a/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh
new file mode 100755
index 00000000000..e7c125d16de
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/chain/run_cnn_e2eali_1b.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+
+# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the
+# lattice alignments and to build a tree
+
+# local/chain/compare_wer.sh scale_baseline2/exp_yomdle_farsi/chain/e2e_cnn_1a scale_baseline2/exp_yomdle_farsi/chain/cnn_e2eali_1b
+# System                      e2e_cnn_1a cnn_e2eali_1b
+# WER                             19.55     18.45
+# CER                              5.64      4.94
+# Final train prob              -0.0065   -0.0633
+# Final valid prob               0.0015   -0.0619
+# Final train prob (xent)                 -0.2636
+# Final valid prob (xent)                 -0.2511
+
+set -e -o pipefail
+
+data_dir=data
+exp_dir=exp
+
+stage=0
+
+nj=30
+train_set=train
+nnet3_affix=    # affix for exp dirs, e.g. it was _cleaned in tedlium.
+affix=_1b  #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
+common_egs_dir=
+reporting_email=
+
+# chain options
+train_stage=-10
+xent_regularize=0.1
+frame_subsampling_factor=4
+# training chunk-options
+chunk_width=340,300,200,100
+num_leaves=500
+# we don't need extra left/right context for TDNN systems.
+chunk_left_context=0
+chunk_right_context=0
+tdnn_dim=450
+# training options
+srand=0
+remove_egs=true
+lang_test=lang_test
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+e2echain_model_dir=$exp_dir/chain/e2e_cnn_1a
+ali_dir=$exp_dir/chain/e2e_ali_train
+lat_dir=$exp_dir/chain${nnet3_affix}/e2e_${train_set}_lats
+dir=$exp_dir/chain${nnet3_affix}/cnn_e2eali${affix}
+train_data_dir=$data_dir/${train_set}
+tree_dir=$exp_dir/chain${nnet3_affix}/tree_e2e
+
+# the 'lang' directory is created by this script.
+# If you create such a directory with a non-standard topology
+# you should probably name it differently.
+lang=$data_dir/lang_chain
+for f in $train_data_dir/feats.scp $ali_dir/ali.1.gz $ali_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+
+if [ $stage -le 1 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt $data_dir/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r $data_dir/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
+                            --acoustic-scale 1.0 \
+                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir
+  echo "" >$lat_dir/splice_opts
+
+fi
+
+if [ $stage -le 3 ]; then
+  # Build a tree using our new topology.  We know we have alignments for the
+  # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use
+  # those.  The num-leaves is always somewhat less than the num-leaves from
+  # the GMM baseline.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor $frame_subsampling_factor \
+    --alignment-subsampling-factor 1 \
+    --context-opts "--context-width=2 --central-position=1" \
+    --cmd "$cmd" $num_leaves ${train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 4 ]; then  # stage 4: write network.xconfig and expand it into nnet3 configs
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(python -c "print(0.5/$xent_regularize)")  # print() form runs on python 2 and 3
+  cnn_opts="l2-regularize=0.075"
+  tdnn_opts="l2-regularize=0.075"
+  output_opts="l2-regularize=0.1"
+  common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=72"
+  common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=144"
+  common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=196"
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=120 name=input
+
+  conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
+  conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
+  conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
+  relu-batchnorm-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' models... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 5 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$cmd" \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.00005 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=500" \
+    --chain.frame-subsampling-factor=$frame_subsampling_factor \
+    --chain.alignment-subsampling-factor=1 \
+    --chain.left-tolerance 3 \
+    --chain.right-tolerance 3 \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=16 \
+    --trainer.frames-per-iter=1000000 \
+    --trainer.optimization.num-jobs-initial=4 \
+    --trainer.optimization.num-jobs-final=8 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.shrink-value=1.0 \
+    --trainer.num-chunk-per-minibatch=32,16 \
+    --trainer.optimization.momentum=0.0 \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=$chunk_left_context \
+    --egs.chunk-right-context=$chunk_right_context \
+    --egs.chunk-left-context-initial=0 \
+    --egs.chunk-right-context-final=0 \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0 --constrained false" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 6 ]; then
+  # The reason we are using $data_dir/$lang_test here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model).  So you could give it a different
+  # lang directory, one that contained a wordlist and LM of your choice,
+  # as long as phones.txt was compatible.
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 $data_dir/$lang_test \
+    $dir $dir/graph || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --extra-left-context $chunk_left_context \
+    --extra-right-context $chunk_right_context \
+    --extra-left-context-initial 0 \
+    --extra-right-context-final 0 \
+    --frames-per-chunk $frames_per_chunk \
+    --nj $nj --cmd "$cmd" \
+    $dir/graph $data_dir/test $dir/decode_test || exit 1;
+fi
diff --git a/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh b/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh
new file mode 100755
index 00000000000..bb5352943f6
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/chain/run_flatstart_cnn1a.sh
@@ -0,0 +1,170 @@
+#!/bin/bash
+# Copyright    2017  Hossein Hadian
+
+# This script does end2end chain training (i.e. from scratch)
+
+# local/chain/compare_wer.sh exp_yomdle_farsi/chain/e2e_cnn_1a exp_yomdle_farsi/chain/cnn_e2eali_1b
+# System                      e2e_cnn_1a cnn_e2eali_1b
+# WER                             19.55     18.45
+# CER                              5.64      4.94
+# Final train prob              -0.0065   -0.0633
+# Final valid prob               0.0015   -0.0619
+# Final train prob (xent)                 -0.2636
+# Final valid prob (xent)                 -0.2511
+
+set -e
+
+data_dir=data
+exp_dir=exp
+
+# configs for 'chain'
+stage=0
+nj=30
+train_stage=-10
+get_egs_stage=-10
+affix=1a
+
+# training options
+tdnn_dim=450
+num_epochs=4
+num_jobs_initial=4
+num_jobs_final=8
+minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4
+common_egs_dir=
+l2_regularize=0.00005
+frames_per_iter=1000000
+cmvn_opts="--norm-means=false --norm-vars=false"
+train_set=train
+lang_test=lang_test
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+lang=$data_dir/lang_e2e
+treedir=$exp_dir/chain/e2e_monotree  # it's actually just a trivial tree (no tree building)
+dir=$exp_dir/chain/e2e_cnn_${affix}
+
+if [ $stage -le 0 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r $data_dir/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 1 ]; then
+  steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \
+                                       --shared-phones true \
+                                       --type mono \
+                                       $data_dir/$train_set $lang $treedir
+  $cmd $treedir/log/make_phone_lm.log \
+  cat $data_dir/$train_set/text \| \
+    steps/nnet3/chain/e2e/text_to_phones.py $data_dir/lang \| \
+    utils/sym2int.pl -f 2- $data_dir/lang/phones.txt \| \
+    chain-est-phone-lm --num-extra-lm-states=500 \
+                       ark:- $treedir/phone_lm.fst
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+  num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
+  
+  cnn_opts="l2-regularize=0.075"
+  tdnn_opts="l2-regularize=0.075"
+  output_opts="l2-regularize=0.1"
+
+  common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=72"
+  common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=144"
+  common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=144"
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=120 name=input
+  conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1
+  conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
+  conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-4,0,4 $common3
+  relu-batchnorm-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
+EOF
+
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs
+fi
+
+if [ $stage -le 3 ]; then
+  # no need to store the egs in a shared storage because we always
+  # remove them. Anyway, it takes only 5 minutes to generate them.
+
+  steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \
+    --cmd "$cmd" \
+    --feat.cmvn-opts "$cmvn_opts" \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize $l2_regularize \
+    --chain.apply-deriv-weights false \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
+    --chain.frame-subsampling-factor 4 \
+    --chain.alignment-subsampling-factor 4 \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter $frames_per_iter \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.momentum 0 \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.optimization.shrink-value 1.0 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs true \
+    --feat-dir $data_dir/${train_set} \
+    --tree-dir $treedir \
+    --dir $dir  || exit 1;
+fi
+
+if [ $stage -le 4 ]; then
+  # The reason we are using $data_dir/$lang_test here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model).  So you could give it a different
+  # lang directory, one that contained a wordlist and LM of your choice,
+  # as long as phones.txt was compatible.
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 $data_dir/$lang_test \
+    $dir $dir/graph || exit 1;
+fi
+
+if [ $stage -le 5 ]; then
+  # Decode the test set; no --frames-per-chunk is passed (this script defines no chunk_width).
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --nj $nj --cmd "$cmd" \
+    $dir/graph $data_dir/test $dir/decode_test || exit 1;
+fi
+
+echo "Done. Date: $(date). Results:"
+local/chain/compare_wer.sh $dir
diff --git a/egs/yomdle_fa/v1/local/create_download.sh b/egs/yomdle_fa/v1/local/create_download.sh
new file mode 100755
index 00000000000..1040ecc2165
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/create_download.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Copyright 2018 Chun-Chieh Chang
+
+# The original format of the dataset given is GEDI and page images.
+# This script is written to create line images from page images.
+# It also creates csv files from the GEDI files.
+
+database_slam=/export/corpora5/slam/SLAM/Farsi/transcribed
+database_yomdle=/export/corpora5/slam/YOMDLE/final_farsi
+slam_dir=download/slam_farsi
+yomdle_dir=download/yomdle_farsi
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1; 
+
+echo "$0: Processing SLAM ${language:-Farsi}"  # 'language' is never set in this script, so default it
+echo "Date: $(date)."
+mkdir -p ${slam_dir}/{truth_csv,truth_csv_raw,truth_line_image}
+local/GEDI2CSV_enriched.py \
+    --inputDir ${database_slam} \
+    --outputDir ${slam_dir}/truth_csv_raw \
+    --log ${slam_dir}/GEDI2CSV_enriched.log
+local/create_line_image_from_page_image.py \
+    ${database_slam} \
+    ${slam_dir}/truth_csv_raw \
+    ${slam_dir}
+
+echo "$0: Processing YOMDLE ${language:-Farsi}"  # same default as for the SLAM message
+echo "Date: $(date)."
+mkdir -p ${yomdle_dir}/{truth_csv,truth_csv_raw,truth_line_image}
+local/YOMDLE2CSV.py \
+    --inputDir ${database_yomdle} \
+    --outputDir ${yomdle_dir}/truth_csv_raw/ \
+    --log ${yomdle_dir}/YOMDLE2CSV.log
+local/create_line_image_from_page_image.py \
+    --im-format "jpg" \
+    ${database_yomdle}/images \
+    ${yomdle_dir}/truth_csv_raw \
+    ${yomdle_dir}
diff --git a/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py b/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py
new file mode 100755
index 00000000000..77a6791d5d7
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/create_line_image_from_page_image.py
@@ -0,0 +1,458 @@
+#!/usr/bin/env python3
+
+# Copyright   2018 Ashish Arora
+# Apache 2.0
+# minimum bounding box part in this script is originally from
+#https://github.com/BebeSparkelSparkel/MinimumBoundingBox
+#https://startupnextdoor.com/computing-convex-hull-in-python/
+""" This module will be used for extracting line images from page image.
+ Given the word segmentation (bounding box around a word) for every word, it will
+ extract line segmentation. To extract line segmentation, it will take word bounding
+ boxes of a line as input, will create a minimum area bounding box that will contain
+ all corner points of word bounding boxes. The obtained bounding box (will not necessarily
+ be vertically or horizontally aligned). Hence to extract line image from line bounding box,
+ page image is rotated and line image is cropped and saved.
+"""
+
+import argparse
+import csv
+import itertools
+import sys
+import os
+import numpy as np
+from math import atan2, cos, sin, pi, degrees, sqrt
+from collections import namedtuple
+
+from scipy.spatial import ConvexHull
+from PIL import Image
+from scipy.misc import toimage
+
+parser = argparse.ArgumentParser(description="Creates line images from page image")
+parser.add_argument('image_dir', type=str, help='Path to full page images')
+parser.add_argument('csv_dir', type=str, help='Path to csv files')
+parser.add_argument('out_dir', type=str, help='Path to output directory')
+parser.add_argument('--im-format', type=str, default='png', help='What file format are the images')
+parser.add_argument('--padding', type=int, default=100, help='Padding so BBox does not exceed image area')
+parser.add_argument('--head', type=int, default=-1, help='Number of csv files to process')
+args = parser.parse_args()
+
+"""
+bounding_box is a named tuple which contains:
+             area (float): area of the rectangle
+             length_parallel (float): length of the side that is parallel to unit_vector
+             length_orthogonal (float): length of the side that is orthogonal to unit_vector
+             rectangle_center(int, int): coordinates of the rectangle center
+             (use rectangle_corners to get the corner points of the rectangle)
+             unit_vector (float, float): direction of the length_parallel side.
+             (its orthogonal vector can be found with the orthogonal_vector function)
+             unit_vector_angle (float): angle of the unit vector to be in radians.
+             corner_points [(float, float)]: set that contains the corners of the rectangle
+"""
+
+bounding_box_tuple = namedtuple('bounding_box_tuple', 'area '
+                                        'length_parallel '
+                                        'length_orthogonal '
+                                        'rectangle_center '
+                                        'unit_vector '
+                                        'unit_vector_angle '
+                                        'corner_points'
+                         )
+
+
+def unit_vector(pt0, pt1):
+    """ Given two points pt0 and pt1, return a unit vector that
+        points in the direction of pt0 to pt1.
+    Returns
+    -------
+    (float, float): unit vector
+    """
+    dis_0_to_1 = sqrt((pt0[0] - pt1[0])**2 + (pt0[1] - pt1[1])**2)
+    return (pt1[0] - pt0[0]) / dis_0_to_1, \
+           (pt1[1] - pt0[1]) / dis_0_to_1
+
+
+def orthogonal_vector(vector):
+    """ Given a vector, returns a orthogonal/perpendicular vector of equal length.
+    Returns
+    ------
+    (float, float): A vector that points in the direction orthogonal to vector.
+    """
+    return -1 * vector[1], vector[0]
+
+
+def bounding_area(index, hull):
+    """ Given index location in an array and convex hull, it gets two points
+        hull[index] and hull[index+1]. From these two points, it returns a named
+        tuple that mainly contains area of the box that bounds the hull. This
+        bounding box orientation is same as the orientation of the lines formed
+        by the point hull[index] and hull[index+1].
+    Returns
+    -------
+    a named tuple that contains:
+    area: area of the rectangle
+    length_parallel: length of the side that is parallel to unit_vector
+    length_orthogonal: length of the side that is orthogonal to unit_vector
+    rectangle_center: coordinates of the rectangle center
+    unit_vector: direction of the length_parallel side.
+    (its orthogonal vector can be found with the orthogonal_vector function)
+    """
+    unit_vector_p = unit_vector(hull[index], hull[index+1])
+    unit_vector_o = orthogonal_vector(unit_vector_p)
+
+    dis_p = tuple(np.dot(unit_vector_p, pt) for pt in hull)
+    dis_o = tuple(np.dot(unit_vector_o, pt) for pt in hull)
+
+    min_p = min(dis_p)
+    min_o = min(dis_o)
+    len_p = max(dis_p) - min_p
+    len_o = max(dis_o) - min_o
+
+    return {'area': len_p * len_o,
+            'length_parallel': len_p,
+            'length_orthogonal': len_o,
+            'rectangle_center': (min_p + len_p / 2, min_o + len_o / 2),
+            'unit_vector': unit_vector_p,
+            }
+
+
+def to_xy_coordinates(unit_vector_angle, point):
+    """ Given angle from horizontal axis and a point from origin,
+        returns converted unit vector coordinates in x, y coordinates.
+        angle of unit vector should be in radians.
+    Returns
+    ------
+    (float, float): converted x,y coordinate of the unit vector.
+    """
+    angle_orthogonal = unit_vector_angle + pi / 2
+    return point[0] * cos(unit_vector_angle) + point[1] * cos(angle_orthogonal), \
+           point[0] * sin(unit_vector_angle) + point[1] * sin(angle_orthogonal)
+
+
+def rotate_points(center_of_rotation, angle, points):
+    """ Rotates a point cloud around the center_of_rotation point by angle
+    input
+    -----
+    center_of_rotation (float, float): point around which the points are rotated.
+    angle (float): angle of rotation to be in radians.
+    points [(float, float)]: Points to be a list or tuple of points. Points to be rotated.
+    Returns
+    ------
+    [(float, float)]: Rotated points around center of rotation by angle
+    """
+    rot_points = []
+    ang = []
+    for pt in points:
+        diff = tuple([pt[d] - center_of_rotation[d] for d in range(2)])
+        diff_angle = atan2(diff[1], diff[0]) + angle
+        ang.append(diff_angle)
+        diff_length = sqrt(sum([d**2 for d in diff]))
+        rot_points.append((center_of_rotation[0] + diff_length * cos(diff_angle),
+                           center_of_rotation[1] + diff_length * sin(diff_angle)))
+
+    return rot_points
+
+
+def rectangle_corners(rectangle):
+    """ Given rectangle center and its inclination, returns the corner
+        locations of the rectangle.
+    Returns
+    ------
+    [(float, float)]: 4 corner points of rectangle.
+    """
+    corner_points = []
+    for i1 in (.5, -.5):
+        for i2 in (i1, -1 * i1):
+            corner_points.append((rectangle['rectangle_center'][0] + i1 * rectangle['length_parallel'],
+                            rectangle['rectangle_center'][1] + i2 * rectangle['length_orthogonal']))
+
+    return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points)
+
+
+def get_orientation(origin, p1, p2):
+    """
+    Given origin and two points, return the orientation of the Point p1 with
+    regards to Point p2 using origin.
+    Returns
+    -------
+    integer: Negative if p1 is clockwise of p2.
+    """
+    difference = (
+        ((p2[0] - origin[0]) * (p1[1] - origin[1]))
+        - ((p1[0] - origin[0]) * (p2[1] - origin[1]))
+    )
+    return difference
+
+
+def compute_hull(points):
+    """
+    Given input list of points, return a list of points that
+    made up the convex hull.
+    Returns
+    -------
+    [(float, float)]: convexhull points
+    """
+    hull_points = []
+    start = points[0]
+    min_x = start[0]
+    for p in points[1:]:
+        if p[0] < min_x:
+            min_x = p[0]
+            start = p
+
+    point = start
+    hull_points.append(start)
+
+    far_point = None
+    while far_point is not start:
+        p1 = None
+        for p in points:
+            if p is point:
+                continue
+            else:
+                p1 = p
+                break
+
+        far_point = p1
+
+        for p2 in points:
+            if p2 is point or p2 is p1:
+                continue
+            else:
+                direction = get_orientation(point, far_point, p2)
+                if direction > 0:
+                    far_point = p2
+
+        hull_points.append(far_point)
+        point = far_point
+    return hull_points
+
+
+def minimum_bounding_box(points):
+    """ Given a list of 2D points, it returns the minimum area rectangle bounding all
+        the points in the point cloud.
+    Returns
+    ------
+    returns a namedtuple that contains:
+    area: area of the rectangle
+    length_parallel: length of the side that is parallel to unit_vector
+    length_orthogonal: length of the side that is orthogonal to unit_vector
+    rectangle_center: coordinates of the rectangle center
+    unit_vector: direction of the length_parallel side.
+    unit_vector_angle: angle of the unit vector, in radians
+    corner_points: set that contains the corners of the rectangle
+    """
+
+    if len(points) <= 2: raise ValueError('More than two points required.')
+
+    hull_ordered = [points[index] for index in ConvexHull(points).vertices]
+    hull_ordered.append(hull_ordered[0])
+    #hull_ordered = compute_hull(points)
+    hull_ordered = tuple(hull_ordered)
+
+    min_rectangle = bounding_area(0, hull_ordered)
+    for i in range(1, len(hull_ordered)-1):
+        rectangle = bounding_area(i, hull_ordered)
+        if rectangle['area'] < min_rectangle['area']:
+            min_rectangle = rectangle
+
+    min_rectangle['unit_vector_angle'] = atan2(min_rectangle['unit_vector'][1], min_rectangle['unit_vector'][0])
+    min_rectangle['rectangle_center'] = to_xy_coordinates(min_rectangle['unit_vector_angle'], min_rectangle['rectangle_center'])
+
+    return bounding_box_tuple(
+        area = min_rectangle['area'],
+        length_parallel = min_rectangle['length_parallel'],
+        length_orthogonal = min_rectangle['length_orthogonal'],
+        rectangle_center = min_rectangle['rectangle_center'],
+        unit_vector = min_rectangle['unit_vector'],
+        unit_vector_angle = min_rectangle['unit_vector_angle'],
+        corner_points = set(rectangle_corners(min_rectangle))
+    )
+
+
+def get_center(im):
+    """ Given image, returns the location of center pixel
+    Returns
+    -------
+    (int, int): center of the image
+    """
+    center_x = im.size[0] / 2
+    center_y = im.size[1] / 2
+    return int(center_x), int(center_y)
+
+
+def get_horizontal_angle(unit_vector_angle):
+    """ Given an angle in radians, returns angle of the unit vector in
+        first or fourth quadrant.
+    Returns
+    ------
+    (float): updated angle of the unit vector to be in radians.
+             It is only in first or fourth quadrant.
+    """
+    if unit_vector_angle > pi / 2 and unit_vector_angle <= pi:
+        unit_vector_angle = unit_vector_angle - pi
+    elif unit_vector_angle > -pi and unit_vector_angle < -pi / 2:
+        unit_vector_angle = unit_vector_angle + pi
+
+    return unit_vector_angle
+
+
+def get_smaller_angle(bounding_box):
+    """ Given a rectangle, returns its smallest absolute angle from horizontal axis.
+    Returns
+    ------
+    (float): smallest angle of the rectangle to be in radians.
+    """
+    unit_vector = bounding_box.unit_vector
+    unit_vector_angle = bounding_box.unit_vector_angle
+    ortho_vector = orthogonal_vector(unit_vector)
+    ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0])
+
+    unit_vector_angle_updated = get_horizontal_angle(unit_vector_angle)
+    ortho_vector_angle_updated = get_horizontal_angle(ortho_vector_angle)
+
+    if abs(unit_vector_angle_updated) < abs(ortho_vector_angle_updated):
+        return unit_vector_angle_updated
+    else:
+        return ortho_vector_angle_updated
+
+
+def rotated_points(bounding_box, center):
+    """ Given the rectangle, returns corner points of rotated rectangle.
+        It rotates the rectangle around the center by its smallest angle.
+    Returns
+    -------
+    [(int, int)]: 4 corner points of rectangle.
+    """
+    p1, p2, p3, p4 = bounding_box.corner_points
+    x1, y1 = p1
+    x2, y2 = p2
+    x3, y3 = p3
+    x4, y4 = p4
+    center_x, center_y = center
+    rotation_angle_in_rad = -get_smaller_angle(bounding_box)
+    x_dash_1 = (x1 - center_x) * cos(rotation_angle_in_rad) - (y1 - center_y) * sin(rotation_angle_in_rad) + center_x
+    x_dash_2 = (x2 - center_x) * cos(rotation_angle_in_rad) - (y2 - center_y) * sin(rotation_angle_in_rad) + center_x
+    x_dash_3 = (x3 - center_x) * cos(rotation_angle_in_rad) - (y3 - center_y) * sin(rotation_angle_in_rad) + center_x
+    x_dash_4 = (x4 - center_x) * cos(rotation_angle_in_rad) - (y4 - center_y) * sin(rotation_angle_in_rad) + center_x
+
+    y_dash_1 = (y1 - center_y) * cos(rotation_angle_in_rad) + (x1 - center_x) * sin(rotation_angle_in_rad) + center_y
+    y_dash_2 = (y2 - center_y) * cos(rotation_angle_in_rad) + (x2 - center_x) * sin(rotation_angle_in_rad) + center_y
+    y_dash_3 = (y3 - center_y) * cos(rotation_angle_in_rad) + (x3 - center_x) * sin(rotation_angle_in_rad) + center_y
+    y_dash_4 = (y4 - center_y) * cos(rotation_angle_in_rad) + (x4 - center_x) * sin(rotation_angle_in_rad) + center_y
+    return x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4
+
+
+def pad_image(image):
+    """ Given an image, returns a padded image around the border.
+        This routine saves the code from crashing if bounding boxes are
+        slightly outside the page boundary.
+    Returns
+    -------
+    image: page image
+    """
+    offset = int(args.padding // 2)
+    padded_image = Image.new('RGB', (image.size[0] + int(args.padding), image.size[1] + int(args.padding)), "white")
+    padded_image.paste(im = image, box = (offset, offset))
+    return padded_image
+
+def update_minimum_bounding_box_input(bounding_box_input):
+    """ Given list of 2D points, returns list of 2D points shifted by an offset.
+    Returns
+    ------
+    points [(float, float)]: points, a list or tuple of 2D coordinates
+    """
+    updated_minimum_bounding_box_input = []
+    offset = int(args.padding // 2)
+    for point in bounding_box_input:
+        x, y = point
+        new_x = x + offset
+        new_y = y + offset
+        word_coordinate = (new_x, new_y)
+        updated_minimum_bounding_box_input.append(word_coordinate)
+
+    return updated_minimum_bounding_box_input
+
+
+### main ###
+csv_count = 0
+for filename in sorted(os.listdir(args.csv_dir)):
+    if filename.endswith('.csv') and (csv_count < args.head or args.head < 0):
+        csv_count = csv_count + 1
+        image_file = os.path.join(args.image_dir, os.path.splitext(filename)[0] + '.' + args.im_format)
+        if not os.path.isfile(image_file):
+            continue
+        csv_out_file = os.path.join(args.out_dir, 'truth_csv', filename)
+        # Open input and output csv in one 'with' so that the output is always closed.
+        with open(os.path.join(args.csv_dir, filename), 'r', encoding='utf-8') as f, \
+                open(csv_out_file, 'w', encoding='utf-8') as csv_out_fh:
+            csv_out_writer = csv.writer(csv_out_fh)
+            im = Image.open(image_file)
+            im = pad_image(im)
+            # Start at index 1 to skip the csv header row.
+            for row in itertools.islice(csv.reader(f), 1, None):
+                # Columns 2-9 hold the four corner points of the box.
+                points = []
+                points.append((int(row[2]), int(row[3])))
+                points.append((int(row[4]), int(row[5])))
+                points.append((int(row[6]), int(row[7])))
+                points.append((int(row[8]), int(row[9])))
+
+                x = [int(row[2]), int(row[4]), int(row[6]), int(row[8])]
+                y = [int(row[3]), int(row[5]), int(row[7]), int(row[9])]
+                min_x, min_y = min(x), min(y)
+                max_x, max_y = max(x), max(y)
+                if min_x == max_x or min_y == max_y:
+                    continue
+
+                try:
+                    updated_mbb_input = update_minimum_bounding_box_input(points)
+                    bounding_box = minimum_bounding_box(updated_mbb_input)
+                except Exception:
+                    print("Error: Skipping Image " + row[1])
+                    continue
+
+                p1, p2, p3, p4 = bounding_box.corner_points
+                x1, y1 = p1
+                x2, y2 = p2
+                x3, y3 = p3
+                x4, y4 = p4
+                min_x = int(min(x1, x2, x3, x4))
+                min_y = int(min(y1, y2, y3, y4))
+                max_x = int(max(x1, x2, x3, x4))
+                max_y = int(max(y1, y2, y3, y4))
+                box = (min_x, min_y, max_x, max_y)
+                region_initial = im.crop(box)
+                rot_points = []
+                p1_new = (x1 - min_x, y1 - min_y)
+                p2_new = (x2 - min_x, y2 - min_y)
+                p3_new = (x3 - min_x, y3 - min_y)
+                p4_new = (x4 - min_x, y4 - min_y)
+                rot_points.append(p1_new)
+                rot_points.append(p2_new)
+                rot_points.append(p3_new)
+                rot_points.append(p4_new)
+
+                # Same box expressed in the coordinates of the cropped region.
+                cropped_bounding_box = bounding_box_tuple(bounding_box.area,
+                        bounding_box.length_parallel,
+                        bounding_box.length_orthogonal,
+                        (bounding_box.rectangle_center[0] - min_x, bounding_box.rectangle_center[1] - min_y),
+                        bounding_box.unit_vector,
+                        bounding_box.unit_vector_angle,
+                        set(rot_points))
+
+                # Rotate the crop so the line becomes (close to) horizontal.
+                rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box)
+                img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC)
+                x_dash_1, y_dash_1, x_dash_2, y_dash_2, x_dash_3, y_dash_3, x_dash_4, y_dash_4 = rotated_points(
+                    cropped_bounding_box, get_center(region_initial))
+
+                min_x = int(min(x_dash_1, x_dash_2, x_dash_3, x_dash_4))
+                min_y = int(min(y_dash_1, y_dash_2, y_dash_3, y_dash_4))
+                max_x = int(max(x_dash_1, x_dash_2, x_dash_3, x_dash_4))
+                max_y = int(max(y_dash_1, y_dash_2, y_dash_3, y_dash_4))
+                box = (min_x, min_y, max_x, max_y)
+                region_final = img2.crop(box)
+                csv_out_writer.writerow(row)
+                image_out_file = os.path.join(args.out_dir, 'truth_line_image', row[1])
+                region_final.save(image_out_file)
diff --git a/egs/yomdle_fa/v1/local/extract_features.sh b/egs/yomdle_fa/v1/local/extract_features.sh
new file mode 100755
index 00000000000..7d6806a2712
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/extract_features.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright   2017 Yiwen Shao
+#             2018 Ashish Arora
+
+nj=4
+cmd=run.pl
+feat_dim=40
+fliplr=false
+augment=false
+num_channels=3
+echo "$0 $@"
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+
+data=$1
+featdir=$data/data
+scp=$data/images.scp
+logdir=$data/log
+
+mkdir -p $logdir
+mkdir -p $featdir
+
+# make $featdir an absolute pathname
+featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}`
+
+for n in $(seq $nj); do
+    split_scps="$split_scps $logdir/images.$n.scp"
+done
+
+# split images.scp
+utils/split_scp.pl $scp $split_scps || exit 1;
+
+$cmd JOB=1:$nj $logdir/extract_features.JOB.log \
+  image/ocr/make_features.py $logdir/images.JOB.scp \
+    --allowed_len_file_path $data/allowed_lengths.txt \
+    --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment $augment \| \
+    copy-feats --compress=true --compression-method=7 \
+    ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp
+
+## aggregates the output scp's to get feats.scp
+for n in $(seq $nj); do
+  cat $featdir/images.$n.scp || exit 1;
+done > $data/feats.scp || exit 1
diff --git a/egs/yomdle_fa/v1/local/gedi2csv.py b/egs/yomdle_fa/v1/local/gedi2csv.py
new file mode 100755
index 00000000000..43a07421dd1
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/gedi2csv.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+
+"""
+GEDI2CSV
+Convert GEDI-type bounding boxes to CSV format
+
+GEDI Format Example:
+<GEDI xmlns= GEDI_version= GEDI_date=>
+    <USER name= date= dateFormat="mm/dd/yyyy hh:mm"> </USER>
+    <DL_DOCUMENT src= NrOfPages= docTag=>
+        <DL_PAGE gedi_type= src= pageID= width= height=>
+            <DL_ZONE gedi_type= id= col= row= width= height= Language= Quality= Overlay= Script= Type= Text_Content=> </DL_ZONE>
+        </DL_PAGE>
+    </DL_DOCUMENT>
+</GEDI>
+
+CSV Format Example
+ID,name,col1,row1,col2,row2,col3,row3,col4,row4,confidence,truth,pgrot,bbrot,qual,script,lang
+0,chinese_scanned_books_0001_0.png,99,41,99,14,754,14,754,41,100,凡我的邻人说是好的，有一大部分在我灵魂中却,0,0.0,0,,zh-cn
+"""
+
+import logging
+import os
+import sys
+import time
+import glob
+import csv
+import imghdr
+from PIL import Image
+import argparse
+import pdb
+import cv2
+import numpy as np
+import xml.etree.ElementTree as ET
+
+sin = np.sin
+cos = np.cos
+pi = np.pi
+
+def Rotate2D(pts, cnt, ang=90):
+    M = np.array([[cos(ang),-sin(ang)],[sin(ang),cos(ang)]])
+    res = np.dot(pts-cnt,M)+cnt
+    return M, res
+
+def npbox2string(npar):
+    if np.shape(npar)[0] != 1:
+        print('Error during CSV conversion\n')
+    c1,r1 = npar[0][0],npar[0][1]
+    c2,r2 = npar[0][2],npar[0][3]
+    c3,r3 = npar[0][4],npar[0][5]
+    c4,r4 = npar[0][6],npar[0][7]
+
+    return c1,r1,c2,r2,c3,r3,c4,r4
+
+# cv2.minAreaRect() returns a Box2D structure which contains following details - ( center (x,y), (width, height), angle of rotation )
+# Get 4 corners of the rectangle using cv2.boxPoints()
+    
+class GEDI2CSV():
+    """Writes the text zones collected from one GEDI XML file to a CSV file."""
+
+    def __init__(self, logger, args):
+        """Keep the logger and the parsed arguments (args.outputDir is read later)."""
+        self._logger = logger
+        self._args = args
+
+    def csvfile(self, coords, polys, baseName, pgrot):
+        """Write <outputDir>/<baseName>.csv with a header and one row per zone.
+
+        coords: list of [id, x, y, w, h, degrees, text, qual, script, text_type]
+        polys: list of [id, polygon point strings, text, qual, script, text_type]
+        baseName: output file name without extension; also prefixes image names
+        pgrot: page rotation, copied unchanged into every row
+        Returns the number of rows written (0 when both lists are empty).
+        """
+        writePath = os.path.join(self._args.outputDir, '')
+        if not os.path.isdir(writePath):
+            os.makedirs(writePath)
+
+        rotlist = []
+
+        header=['ID','name','col1','row1','col2','row2','col3','row3','col4','row4','confidence','truth','pgrot','bbrot','qual','script','text_type']
+        conf=100
+        write_ctr = 0
+        if len(coords) == 0 and len(polys) == 0:
+            self._logger.info('Found %s with no text content',(baseName))
+            print('...Found %s with no text content' % (baseName))
+            return 0
+
+        strPos = writePath + baseName
+
+        # Rectangular zones: rotate the box around its upper left corner.
+        for i in coords:
+
+            [id,x,y,w,h,degrees,text,qual,script,text_type] = i
+
+            contour = np.array([(x,y),(x+w,y),(x+w,y+h),(x,y+h)])
+
+            # 'degrees' is converted to radians for Rotate2D.
+            M, rot = Rotate2D(contour, np.array([x,y]), degrees*pi/180)
+            # np.int0 was removed in NumPy 2.0; np.intp is the type it aliased.
+            rot = rot.astype(np.intp)
+
+            # Flatten the 4 rotated corners into one row: c1,r1,...,c4,r4.
+            rot = np.reshape(rot,(-1,1)).T
+            c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(rot)
+
+            # The text may be missing (None); also strip a byte-order mark.
+            text = (text or '').replace(u'\ufeff','')
+
+            bbrot = degrees
+            rotlist.append([id,baseName + '_' + id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,bbrot,qual,script,text_type])
+
+        # Polygon zones: use the minimum-area rectangle around the convex hull.
+        for j in polys:
+            [id,poly_val,text,qual,script,text_type] = j
+            # NOTE(review): eval() on strings read from the XML is unsafe for
+            # untrusted input; ast.literal_eval would be a safer parser.
+            arr = [eval(i) for i in poly_val]
+
+            contour = np.asarray(arr)
+            convex = cv2.convexHull(contour)
+            rect = cv2.minAreaRect(convex)
+            box = cv2.boxPoints(rect)
+            box = box.astype(np.intp)
+            box = np.reshape(box,(-1,1)).T
+            c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(box)
+
+            bbrot = 0.0
+
+            rotlist.append([id,baseName + '_' + id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,bbrot,qual,script,text_type])
+
+        # then write out all of list to file
+        with open(strPos + ".csv", "w", encoding="utf-8") as f:
+            writer = csv.writer(f)
+            writer.writerow(header)
+            for row in rotlist:
+                writer.writerow(row)
+                write_ctr += 1
+
+        return write_ctr
+    
+
+def main(args):
+    """Convert every GEDI XML file found under args.inputDir (recursively)
+    into a CSV file in args.outputDir, and print a summary."""
+
+    writePath = args.outputDir
+    if not os.path.isdir(writePath):
+        os.makedirs(writePath)
+
+    # Setup logging
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+    if args.log:
+        handler = logging.FileHandler(args.log)
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
+    gtconverter = GEDI2CSV(logger, args)
+    namespaces = {"gedi" : "http://lamp.cfar.umd.edu/media/projects/GEDI/"}
+    keyCnt=0
+
+    fileCnt = 0
+    line_write_ctr = 0
+    line_error_ctr = 0
+
+    # Get all XML files in the directory and its sub folders (following
+    # symlinks). keyCnt and line_error_ctr are bookkeeping counters that are
+    # never reported.
+    for root, dirnames, filenames in os.walk(args.inputDir, followlinks=True):
+        for file in filenames:
+            if file.lower().endswith('.xml'):
+                fullName = os.path.join(root,file)
+                baseName = os.path.splitext(fullName)
+
+                fileCnt += 1
+
+                # read the XML file
+                tree = ET.parse(fullName)
+                gedi_root = tree.getroot()
+                child = gedi_root.findall('gedi:DL_DOCUMENT',namespaces)[0]
+                totalpages = int(child.attrib['NrOfPages'])
+                coordinates=[]
+                polygons = []
+                if args.ftype == 'boxed':
+                    fileTypeStr = 'col'
+                elif args.ftype == 'transcribed':
+                    fileTypeStr = 'Text_Content'
+                else:
+                    print('Filetype must be either boxed or transcribed!')
+                    logger.info('Filetype must be either boxed or transcribed!')
+                    sys.exit(-1)
+
+                if args.quality == 'both':
+                    qualset = {'Regular','Low-Quality'}
+                elif args.quality == 'low':
+                    qualset = {'Low-Quality'}
+                elif args.quality == 'regular':
+                    qualset = {'Regular'}
+                else:
+                    print('Quality must be both, low or regular!')
+                    logger.info('Quality must be both, low or regular!')
+                    sys.exit(-1)
+
+
+
+                # and for each page
+                for i, pgs in enumerate(child.iterfind('gedi:DL_PAGE',namespaces)):
+
+                    if 'GEDI_orientation' not in pgs.attrib:
+                        pageRot=0
+                    else:
+                        pageRot = int(pgs.attrib['GEDI_orientation'])
+                        logger.info(' PAGE ROTATION %s, %s' % (fullName, str(pageRot)))
+
+                    # find children for each page
+                    for zone in pgs.findall('gedi:DL_ZONE',namespaces):
+
+                        if zone.attrib['gedi_type']=='Text' and zone.attrib['Type'] in \
+                            ('Machine_Print','Confusable_Allograph','Handwriting') and zone.attrib['Quality'] in qualset:
+                            if zone.get('polygon'):
+                                keyCnt+=1
+                                polygons.append([zone.attrib['id'],zone.get('polygon').split(';'),
+                                                 zone.get('Text_Content'),zone.get('Quality'),zone.get('Script'),zone.get('Type')])
+                            elif zone.get(fileTypeStr) != None:
+                                keyCnt+=1
+                                coord = [zone.attrib['id'],int(zone.attrib['col']),int(zone.attrib['row']),
+                                                    int(zone.attrib['width']), int(zone.attrib['height']),
+                                                    float(zone.get('orientationD',0.0)),
+                                                    zone.get('Text_Content'),zone.get('Quality'),zone.get('Script'),zone.get('Type')]
+                                coordinates.append(coord)
+
+                if len(coordinates) > 0 or len(polygons) > 0:
+                    line_write_ctr += gtconverter.csvfile(coordinates, polygons, os.path.splitext(file)[0], pageRot)
+                else:
+                    print('...%s has no applicable content' % (baseName[0]))
+
+    print('complete...total files %d, lines written %d' % (fileCnt, line_write_ctr))
+
+
+def parse_arguments(argv):
+    """ Args and defaults """
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--inputDir', type=str, help='Input directory', required=True)
+    parser.add_argument('--outputDir', type=str, help='Output directory', required=True)
+    parser.add_argument('--ftype', type=str, help='GEDI file type (either "boxed" or "transcribed")', default='transcribed')
+    parser.add_argument('--quality', type=str, help='GEDI file quality (either "both" or "low" or "regular")', default='regular')
+    parser.add_argument('--log', type=str, help='Log directory', default='./GEDI2CSV_enriched.log')
+
+    return parser.parse_args(argv)
+
+if __name__ == '__main__':
+    """ Run """
+    main(parse_arguments(sys.argv[1:]))
+
+
+
+    
+
+
diff --git a/egs/yomdle_fa/v1/local/prepare_dict.sh b/egs/yomdle_fa/v1/local/prepare_dict.sh
new file mode 100755
index 00000000000..f1b1a8d70cc
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/prepare_dict.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+
+# Copyright      2017  Hossein Hadian
+#                2017  Chun Chieh Chang
+#                2017  Ashish Arora
+
+# This script prepares the dictionary.
+
+set -e
+dir=data/local/dict
+data_dir=data
+
+. ./utils/parse_options.sh || exit 1;
+
+base_dir=$(echo "$DIRECTORY" | cut -d "/" -f2)
+
+mkdir -p $dir
+
+local/prepare_lexicon.py --data-dir $data_dir $dir
+
+sed -i '/^\s*$/d' $dir/lexicon.txt
+cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;
+
+echo '<sil> SIL' >> $dir/lexicon.txt
+
+echo SIL > $dir/silence_phones.txt
+
+echo SIL >$dir/optional_silence.txt
+
+echo -n "" >$dir/extra_questions.txt
diff --git a/egs/yomdle_fa/v1/local/prepare_lexicon.py b/egs/yomdle_fa/v1/local/prepare_lexicon.py
new file mode 100755
index 00000000000..46be4f37970
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/prepare_lexicon.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python3
+
+# Copyright  2018  Ashish Arora
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
+parser.add_argument('dir', type=str, help='output path')
+parser.add_argument('--data-dir', type=str, default='data', help='Path to text file')
+args = parser.parse_args()
+
+### main ###
+# Maps every word of <data-dir>/train/text to its space-separated character sequence.
+lex = {}
+text_path = os.path.join(args.data_dir, 'train', 'text')
+
+with open(text_path, 'r', encoding='utf-8') as f:
+    for line in f:
+        # The first field is the utterance id, the remaining fields are the words.
+        for word in line.strip().split(' ')[1:]:
+            # Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word
+            characters = " ".join('SIL' if char == '|' else char for char in word)
+            characters = characters.replace('#', '<HASH>')
+            lex[word] = characters
+
+# Write the lexicon sorted by word, one "<word> <characters>" entry per line.
+with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp:
+    for key in sorted(lex):
+        fp.write(key + " " + lex[key] + "\n")
diff --git a/egs/yomdle_fa/v1/local/process_data.py b/egs/yomdle_fa/v1/local/process_data.py
new file mode 100755
index 00000000000..3423cc5380e
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/process_data.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+
+# Copyright      2018  Ashish Arora
+#                2018  Chun Chieh Chang
+
+""" This script reads the extracted Farsi OCR (yomdle and slam) database files
+    found under <database_path>/truth_csv and creates the following files in
+    <out_dir>: text, utt2spk, images.scp.
+  Eg. local/process_data.py download/yomdle_farsi data_yomdle_farsi/yomdle
+  Eg. text file: english_phone_books_0001_1 To sum up, then, it would appear that
+      utt2spk file: english_phone_books_0001_0 english_phone_books_0001
+      images.scp file: english_phone_books_0001_0 \
+      data/download/truth_line_image/english_phone_books_0001_0.png
+"""
+
+import argparse
+import os
+import sys
+import csv
+import itertools
+import unicodedata
+
+parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files")
+parser.add_argument('database_path', type=str, help='Path to data')
+parser.add_argument('out_dir', type=str, help='directory to output files')
+parser.add_argument('--head', type=int, default=-1, help='limit on number of synth data')
+args = parser.parse_args()
+
+### main ###
+print("Processing '{}' data...".format(args.out_dir))
+
+csv_dir = os.path.join(args.database_path, 'truth_csv')
+image_dir = os.path.join(args.database_path, 'truth_line_image')
+count = 0
+# Context managers guarantee that all three output files are flushed and closed.
+with open(os.path.join(args.out_dir, 'text'), 'w', encoding='utf-8') as text_fh, \
+     open(os.path.join(args.out_dir, 'utt2spk'), 'w', encoding='utf-8') as utt2spk_fh, \
+     open(os.path.join(args.out_dir, 'images.scp'), 'w', encoding='utf-8') as image_fh:
+    for filename in sorted(os.listdir(csv_dir)):
+        # --head limits how many csv files are read; a negative value means no limit.
+        if not filename.endswith('.csv') or 0 <= args.head <= count:
+            continue
+        count += 1
+        with open(os.path.join(csv_dir, filename), 'r', encoding='utf-8') as csv_file:
+            rows = csv.reader(csv_file)
+            next(rows, None)  # skip the header row
+            for row in rows:
+                image_id = os.path.splitext(row[1])[0]
+                image_filepath = os.path.join(image_dir, row[1])
+                text = unicodedata.normalize('NFC', row[11])
+                # Skip empty image files and lines without a transcription.
+                if os.stat(image_filepath).st_size == 0 or not text:
+                    continue
+                text_fh.write(image_id + ' ' + text + '\n')
+                # The speaker id is the image id without its last '_'-separated field.
+                utt2spk_fh.write(image_id + ' ' + '_'.join(image_id.split('_')[:-1]) + '\n')
+                # images.scp carries the image path followed by column 13 of the csv row.
+                image_fh.write(image_id + ' ' + image_filepath + ' ' + row[13] + '\n')
diff --git a/egs/yomdle_fa/v1/local/score.sh b/egs/yomdle_fa/v1/local/score.sh
new file mode 100755
index 00000000000..f2405205f02
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/score.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# Runs the WER scoring script, then the CER scoring script from its stage 2, on the same arguments.
+steps/scoring/score_kaldi_wer.sh --max-lmwt 10 "$@"
+steps/scoring/score_kaldi_cer.sh --max-lmwt 10 --stage 2 "$@"
diff --git a/egs/yomdle_fa/v1/local/train_lm.sh b/egs/yomdle_fa/v1/local/train_lm.sh
new file mode 100755
index 00000000000..bc738f217da
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/train_lm.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Ashish Arora
+#           2017  Hossein Hadian
+# Apache 2.0
+#
+# This script trains a LM on the YOMDLE training transcriptions.
+# It is based on the example scripts distributed with PocoLM
+
+# It checks that pocolm is installed and, if it is not, prints installation instructions and exits.
+
+set -e
+stage=0
+dir=data/local/local_lm
+data_dir=data
+
+echo "$0 $@"  # Print the command line for logging
+. ./utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters
+# un-comment the following line, and change the numbers to some appropriate values.
+# You can find the values from output log of train_lm.py.
+# These example numbers of metaparameters is for 4-gram model (with min-counts)
+# running with train_lm.py.
+# The dev perplexity should be close to the non-bypassed model.
+#bypass_metaparam_optim_opt=
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure that make_lm_dir.py is called and to train only a 3-gram model
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+  nr=`cat $data_dir/train/text | wc -l`
+  nr_dev=$(($nr / 10 ))
+  nr_train=$(( $nr - $nr_dev ))
+
+  # use the training data as an additional data source.
+  # we can later fold the dev data into this.
+  head -n $nr_train $data_dir/train/text | cut -d " " -f 2- >  ${dir}/data/text/train.txt
+  tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (the validation data is used as ${dir}/data/text/dev.txt to work
+  # out interpolation weights.)
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cut -d " " -f 2-  < $data_dir/test/text  > ${dir}/data/real_dev_set.txt
+
+  # get the wordlist from the training text
+  cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
+fi
+
+order=3
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary.
+  # Note: you'd use --wordlist if you had a previously determined word-list
+  # that you wanted to use.
+  # Note: if you have more than one order, use a certain amount of words as the
+  # vocab and want to restrict max memory for 'sort',
+  echo "$0: training the unpruned LM"
+  min_counts='train=1'
+  wordlist=${dir}/data/wordlist
+
+  lm_name="`basename ${wordlist}`_${order}"
+  if [ -n "${min_counts}" ]; then
+    lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+  fi
+  unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+  train_lm.py  --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \
+               --min-counts="$min_counts" \
+               --limit-unk-history=true \
+               ${bypass_metaparam_optim_opt} \
+               ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
+fi
diff --git a/egs/yomdle_fa/v1/local/train_lm_lr.sh b/egs/yomdle_fa/v1/local/train_lm_lr.sh
new file mode 100755
index 00000000000..5bfc20acdeb
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/train_lm_lr.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Ashish Arora
+#           2017  Hossein Hadian
+# Apache 2.0
+#
+# This script trains a LM on the YOMDLE+Extra training transcriptions.
+# It is based on the example scripts distributed with PocoLM
+
+# It checks that pocolm is installed and, if it is not, prints installation instructions and exits.
+
+set -e
+stage=0
+dir=data/local/local_lm
+data_dir=data
+extra_lm=download/extra_lm.txt
+order=3
+
+echo "$0 $@"  # Print the command line for logging
+. ./utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters
+# un-comment the following line, and change the numbers to some appropriate values.
+# You can find the values from output log of train_lm.py.
+# These example numbers of metaparameters is for 4-gram model (with min-counts)
+# running with train_lm.py.
+# The dev perplexity should be close to the non-bypassed model.
+#bypass_metaparam_optim_opt=
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure that make_lm_dir.py is called and to train only a 3-gram model
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  cat ${extra_lm} | local/bidi.py | utils/lang/bpe/prepend_words.py --encoding 'utf-8' | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > ${dir}/data/text/extra_lm.txt
+  
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+  nr=`cat $data_dir/train/text | wc -l`
+  nr_dev=$(($nr / 10 ))
+  nr_train=$(( $nr - $nr_dev ))
+
+  # use the training data as an additional data source.
+  # we can later fold the dev data into this.
+  head -n $nr_train $data_dir/train/text | cut -d " " -f 2- >  ${dir}/data/text/train.txt
+  tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (the validation data is used as ${dir}/data/text/dev.txt to work
+  # out interpolation weights.)
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cut -d " " -f 2-  < $data_dir/test/text  > ${dir}/data/real_dev_set.txt
+
+  # get the wordlist from the training text and the extra LM text
+  cat ${dir}/data/text/{train,extra_lm}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  #cat ${dir}/data/text/extra_fa.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
+fi
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary.
+  # Note: you'd use --wordlist if you had a previously determined word-list
+  # that you wanted to use.
+  # Note: if you have more than one order, use a certain amount of words as the
+  # vocab and want to restrict max memory for 'sort',
+  echo "$0: training the unpruned LM"
+  min_counts='extra_lm=10 train=1'
+  wordlist=${dir}/data/wordlist
+
+  lm_name="`basename ${wordlist}`_${order}"
+  if [ -n "${min_counts}" ]; then
+    lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+  fi
+  unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+  train_lm.py  --wordlist=${wordlist} --num-splits=30 --warm-start-ratio=1 \
+               --min-counts="$min_counts" \
+               --limit-unk-history=true \
+               ${bypass_metaparam_optim_opt} \
+               ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
+fi
diff --git a/egs/yomdle_fa/v1/local/wer_output_filter b/egs/yomdle_fa/v1/local/wer_output_filter
new file mode 100755
index 00000000000..08d5563bca4
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/wer_output_filter
@@ -0,0 +1,151 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014  Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+# Arabic-specific normalization
+while (<>) {
+  @F = split " ";
+  print "$F[0] ";
+  foreach $s (@F[1..$#F]) {
+    # Normalize tabs, spaces, and no-break spaces
+    $s =~ s/[\x{0009}\x{0020}\x{00A0}]+/ /g;
+    # Normalize "dots"/"filled-circles" to periods
+    $s =~ s/[\x{25CF}\x{2022}\x{2219}]+/\x{002E}/g;
+    # Normalize dashes to regular hyphen
+    $s =~ s/[\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}]+/\x{002D}/g;
+    # Normalize fullwidth parentheses to regular parentheses
+    $s =~ s/\x{FF09}/\x{0029}/g;
+    $s =~ s/\x{FF08}/\x{0028}/g;
+    
+    # Convert various presentation forms to base form
+    $s =~ s/[\x{FED1}\x{FED3}\x{FED4}\x{FED2}]+/\x{0641}/g;
+    $s =~ s/[\x{FBB0}\x{FBB1}]+/\x{06D3}/g;
+    $s =~ s/[\x{FECD}\x{FECF}\x{FED0}\x{FECE}]+/\x{063A}/g;
+    $s =~ s/[\x{FBDD}]+/\x{0677}/g;
+    $s =~ s/[\x{FBA6}\x{FBA8}\x{FBA9}\x{FBA7}]+/\x{06C1}/g;
+    $s =~ s/[\x{FEC1}\x{FEC3}\x{FEC4}\x{FEC2}]+/\x{0637}/g;
+    $s =~ s/[\x{FE85}\x{FE86}]+/\x{0624}/g;
+    $s =~ s/[\x{FEA5}\x{FEA7}\x{FEA8}\x{FEA6}]+/\x{062E}/g;
+    $s =~ s/[\x{FBD9}\x{FBDA}]+/\x{06C6}/g;
+    $s =~ s/[\x{FE8F}\x{FE91}\x{FE92}\x{FE90}]+/\x{0628}/g;
+    $s =~ s/[\x{FEED}\x{FEEE}]+/\x{0648}/g;
+    $s =~ s/[\x{FE99}\x{FE9B}\x{FE9C}\x{FE9A}]+/\x{062B}/g;
+    $s =~ s/[\x{FEBD}\x{FEBF}\x{FEC0}\x{FEBE}]+/\x{0636}/g;
+    $s =~ s/[\x{FEE5}\x{FEE7}\x{FEE8}\x{FEE6}]+/\x{0646}/g;
+    $s =~ s/[\x{FBFC}\x{FBFE}\x{FBFF}\x{FBFD}]+/\x{06CC}/g;
+    $s =~ s/[\x{FBA4}\x{FBA5}]+/\x{06C0}/g;
+    $s =~ s/[\x{FB72}\x{FB74}\x{FB75}\x{FB73}]+/\x{0684}/g;
+    $s =~ s/[\x{FBD3}\x{FBD5}\x{FBD6}\x{FBD4}]+/\x{06AD}/g;
+    $s =~ s/[\x{FB6A}\x{FB6C}\x{FB6D}\x{FB6B}]+/\x{06A4}/g;
+    $s =~ s/[\x{FB66}\x{FB68}\x{FB69}\x{FB67}]+/\x{0679}/g;
+    $s =~ s/[\x{FB5E}\x{FB60}\x{FB61}\x{FB5F}]+/\x{067A}/g;
+    $s =~ s/[\x{FB88}\x{FB89}]+/\x{0688}/g;
+    $s =~ s/[\x{FB7E}\x{FB80}\x{FB81}\x{FB7F}]+/\x{0687}/g;
+    $s =~ s/[\x{FB8E}\x{FB90}\x{FB91}\x{FB8F}]+/\x{06A9}/g;
+    $s =~ s/[\x{FB86}\x{FB87}]+/\x{068E}/g;
+    $s =~ s/[\x{FE83}\x{FE84}]+/\x{0623}/g;
+    $s =~ s/[\x{FB8A}\x{FB8B}]+/\x{0698}/g;
+    $s =~ s/[\x{FED5}\x{FED7}\x{FED8}\x{FED6}]+/\x{0642}/g;
+    $s =~ s/[\x{FED9}\x{FEDB}\x{FEDC}\x{FEDA}]+/\x{0643}/g;
+    $s =~ s/[\x{FBE0}\x{FBE1}]+/\x{06C5}/g;
+    $s =~ s/[\x{FEB9}\x{FEBB}\x{FEBC}\x{FEBA}]+/\x{0635}/g;
+    $s =~ s/[\x{FEC5}\x{FEC7}\x{FEC8}\x{FEC6}]+/\x{0638}/g;
+    $s =~ s/[\x{FE8D}\x{FE8E}]+/\x{0627}/g;
+    $s =~ s/[\x{FB9A}\x{FB9C}\x{FB9D}\x{FB9B}]+/\x{06B1}/g;
+    $s =~ s/[\x{FEAD}\x{FEAE}]+/\x{0631}/g;
+    $s =~ s/[\x{FEF1}\x{FEF3}\x{FEF4}\x{FEF2}]+/\x{064A}/g;
+    $s =~ s/[\x{FE93}\x{FE94}]+/\x{0629}/g;
+    $s =~ s/[\x{FBE4}\x{FBE6}\x{FBE7}\x{FBE5}]+/\x{06D0}/g;
+    $s =~ s/[\x{FE89}\x{FE8B}\x{FE8C}\x{FE8A}]+/\x{0626}/g;
+    $s =~ s/[\x{FB84}\x{FB85}]+/\x{068C}/g;
+    $s =~ s/[\x{FE9D}\x{FE9F}\x{FEA0}\x{FE9E}]+/\x{062C}/g;
+    $s =~ s/[\x{FB82}\x{FB83}]+/\x{068D}/g;
+    $s =~ s/[\x{FEA1}\x{FEA3}\x{FEA4}\x{FEA2}]+/\x{062D}/g;
+    $s =~ s/[\x{FB52}\x{FB54}\x{FB55}\x{FB53}]+/\x{067B}/g;
+    $s =~ s/[\x{FB92}\x{FB94}\x{FB95}\x{FB93}]+/\x{06AF}/g;
+    $s =~ s/[\x{FB7A}\x{FB7C}\x{FB7D}\x{FB7B}]+/\x{0686}/g;
+    $s =~ s/[\x{FBDB}\x{FBDC}]+/\x{06C8}/g;
+    $s =~ s/[\x{FB56}\x{FB58}\x{FB59}\x{FB57}]+/\x{067E}/g;
+    $s =~ s/[\x{FEB5}\x{FEB7}\x{FEB8}\x{FEB6}]+/\x{0634}/g;
+    $s =~ s/[\x{FBE2}\x{FBE3}]+/\x{06C9}/g;
+    $s =~ s/[\x{FB96}\x{FB98}\x{FB99}\x{FB97}]+/\x{06B3}/g;
+    $s =~ s/[\x{FE80}]+/\x{0621}/g;
+    $s =~ s/[\x{FBAE}\x{FBAF}]+/\x{06D2}/g;
+    $s =~ s/[\x{FB62}\x{FB64}\x{FB65}\x{FB63}]+/\x{067F}/g;
+    $s =~ s/[\x{FEE9}\x{FEEB}\x{FEEC}\x{FEEA}]+/\x{0647}/g;
+    $s =~ s/[\x{FE81}\x{FE82}]+/\x{0622}/g;
+    $s =~ s/[\x{FBDE}\x{FBDF}]+/\x{06CB}/g;
+    $s =~ s/[\x{FE87}\x{FE88}]+/\x{0625}/g;
+    $s =~ s/[\x{FB6E}\x{FB70}\x{FB71}\x{FB6F}]+/\x{06A6}/g;
+    $s =~ s/[\x{FBA0}\x{FBA2}\x{FBA3}\x{FBA1}]+/\x{06BB}/g;
+    $s =~ s/[\x{FBAA}\x{FBAC}\x{FBAD}\x{FBAB}]+/\x{06BE}/g;
+    $s =~ s/[\x{FEA9}\x{FEAA}]+/\x{062F}/g;
+    $s =~ s/[\x{FEE1}\x{FEE3}\x{FEE4}\x{FEE2}]+/\x{0645}/g;
+    $s =~ s/[\x{FEEF}\x{FBE8}\x{FBE9}\x{FEF0}]+/\x{0649}/g;
+    $s =~ s/[\x{FB8C}\x{FB8D}]+/\x{0691}/g;
+    $s =~ s/[\x{FB76}\x{FB78}\x{FB79}\x{FB77}]+/\x{0683}/g;
+    $s =~ s/[\x{FB5A}\x{FB5C}\x{FB5D}\x{FB5B}]+/\x{0680}/g;
+    $s =~ s/[\x{FB9E}\x{FB9F}]+/\x{06BA}/g;
+    $s =~ s/[\x{FEC9}\x{FECB}\x{FECC}\x{FECA}]+/\x{0639}/g;
+    $s =~ s/[\x{FEDD}\x{FEDF}\x{FEE0}\x{FEDE}]+/\x{0644}/g;
+    $s =~ s/[\x{FB50}\x{FB51}]+/\x{0671}/g;
+    $s =~ s/[\x{FEB1}\x{FEB3}\x{FEB4}\x{FEB2}]+/\x{0633}/g;
+    $s =~ s/[\x{FE95}\x{FE97}\x{FE98}\x{FE96}]+/\x{062A}/g;
+    $s =~ s/[\x{FBD7}\x{FBD8}]+/\x{06C7}/g;
+    $s =~ s/[\x{FEAF}\x{FEB0}]+/\x{0632}/g;
+    $s =~ s/[\x{FEAB}\x{FEAC}]+/\x{0630}/g;
+
+    # Remove tatweel
+    $s =~ s/\x{0640}//g;
+    # Remove vowels and hamza
+    $s =~ s/[\x{064B}-\x{0655}]+//g;
+    # Remove right-to-left and left-to-right
+    $s =~ s/[\x{200F}\x{200E}]+//g;
+    # Arabic Keheh to Arabic Kaf
+    $s =~ s/\x{06A9}/\x{0643}/g;
+    # Arabic Yeh to Farsi Yeh
+    $s =~ s/\x{064A}/\x{06CC}/g;
+    # Decompose RIAL
+    $s =~ s/\x{FDFC}/\x{0631}\x{06CC}\x{0627}\x{0644}/g;
+    # Farsi arabic-indic digits to arabic-indic digits
+    $s =~ s/\x{06F0}/\x{0660}/g;
+    $s =~ s/\x{06F1}/\x{0661}/g;
+    $s =~ s/\x{06F2}/\x{0662}/g;
+    $s =~ s/\x{06F3}/\x{0663}/g;
+    $s =~ s/\x{06F4}/\x{0664}/g;
+    $s =~ s/\x{06F5}/\x{0665}/g;
+    $s =~ s/\x{06F6}/\x{0666}/g;
+    $s =~ s/\x{06F7}/\x{0667}/g;
+    $s =~ s/\x{06F8}/\x{0668}/g;
+    $s =~ s/\x{06F9}/\x{0669}/g;
+    # Arabic-indic digits to digits
+    $s =~ s/\x{0660}/0/g;
+    $s =~ s/\x{0661}/1/g;
+    $s =~ s/\x{0662}/2/g;
+    $s =~ s/\x{0663}/3/g;
+    $s =~ s/\x{0664}/4/g;
+    $s =~ s/\x{0665}/5/g;
+    $s =~ s/\x{0666}/6/g;
+    $s =~ s/\x{0667}/7/g;
+    $s =~ s/\x{0668}/8/g;
+    $s =~ s/\x{0669}/9/g;
+    # Arabic comma to comma
+    $s =~ s/\x{060C}/\x{002C}/g;
+
+    $s =~ s/\|/ /g;
+    if ($s ne "") {
+      print "$s";
+    } else {
+      print "";
+    }
+  }
+  print "\n";
+}
+
diff --git a/egs/yomdle_fa/v1/local/yomdle2csv.py b/egs/yomdle_fa/v1/local/yomdle2csv.py
new file mode 100755
index 00000000000..3641de90324
--- /dev/null
+++ b/egs/yomdle_fa/v1/local/yomdle2csv.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+
+"""
+GEDI2CSV
+Convert GEDI-type bounding boxes to CSV format
+
+GEDI Format Example:
+<GEDI xmlns= GEDI_version= GEDI_date=>
+    <USER name= date= dateFormat="mm/dd/yyyy hh:mm"> </USER>
+    <DL_DOCUMENT src= NrOfPages= docTag=>
+        <DL_PAGE gedi_type= src= pageID= width= height=>
+            <DL_ZONE gedi_type= id=  Illegible= polygon=  Language= Text_Content= text_raw=> </DL_ZONE>
+        </DL_PAGE>
+    </DL_DOCUMENT>
+</GEDI>
+
+CSV Format Example
+ID,name,col1,row1,col2,row2,col3,row3,col4,row4,confidence,truth,pgrot,bbrot,qual,script,lang
+0,chinese_scanned_books_0001_0.png,99,41,99,14,754,14,754,41,100,凡我的邻人说是好的，有一大部分在我灵魂中却,0,0.0,0,,zh-cn
+"""
+
+import logging
+import os
+import sys
+import time
+import glob
+import csv
+import imghdr
+from PIL import Image
+import argparse
+import pdb
+import cv2
+import numpy as np
+import xml.etree.ElementTree as ET
+
+sin = np.sin
+cos = np.cos
+pi = np.pi
+
+def Rotate2D(pts, cnt, ang=90):
+    rot = np.array([[cos(ang), -sin(ang)], [sin(ang), cos(ang)]])  # ang goes to sin/cos unchanged (radians)
+    rotated = np.dot(pts - cnt, rot) + cnt  # rotate about cnt: shift, multiply by rot, shift back
+    return rot, rotated
+
+def npbox2string(npar):
+    """Return the first eight entries of row 0 of npar as (c1, r1, c2, r2, c3, r3, c4, r4)."""
+    # A row count other than one is only reported; the conversion still uses row 0.
+    row_count = np.shape(npar)[0]
+    if row_count != 1:
+        print('Error during CSV conversion\n')
+    first_row = npar[0]
+
+    return tuple(first_row[k] for k in range(8))
+
+# cv2.minAreaRect() returns a Box2D structure which contains following detals - ( center (x,y), (width, height), angle of rotation )
+# Get 4 corners of the rectangle using cv2.boxPoints()
+
+class GEDI2CSV():
+    """Converts GEDI text-zone polygons into the rows of a CSV file."""
+
+    def __init__(self, logger, args):
+        """Keep the logger and the parsed arguments (csvfile reads args.outputDir)."""
+        self._logger = logger
+        self._args = args
+
+    def csvfile(self, coords, polys, baseName, pgrot):
+        """Write <outputDir>/<baseName>.csv with one row per usable polygon in polys.
+
+        Each entry of polys is [id, point strings, text, qual, lang].  The points are
+        reduced to the minimum-area rectangle around their convex hull; entries that
+        fail to convert are reported on stdout and skipped.  coords is only used for
+        the emptiness check.
+        NOTE(review): pgrot is reset to 0 below, so the caller's page rotation never
+        reaches the CSV -- confirm that this is intended.
+
+        Returns the number of rows written, or None when both inputs are empty.
+        """
+        import ast  # local import: only needed to parse the polygon points
+
+        writePath = self._args.outputDir
+        if not os.path.isdir(writePath):
+            os.makedirs(writePath)
+        rotlist = []
+        header=['ID','name','col1','row1','col2','row2','col3','row3','col4','row4','confidence','truth','pgrot','bbrot','qual','script','lang']
+        conf=100
+        pgrot = 0
+        bbrot = 0
+        qual = 0
+        script = ''
+
+        write_ctr = 0
+        if len(coords) == 0 and len(polys) == 0:
+            self._logger.info('Found %s with no text content',(baseName))
+            print('...Found %s with no text content' % (baseName))
+            return
+
+        # os.path.join also works when outputDir has no trailing separator.
+        strPos = os.path.join(writePath, baseName)
+
+        for j in polys:
+            try:
+                [id,poly_val,text,qual,lang] = j
+                script=None
+                # literal_eval only accepts Python literals; unlike eval it cannot
+                # execute code embedded in the XML attribute.
+                arr = [ast.literal_eval(i) for i in poly_val if len(i.strip()) > 0]
+                contour = np.asarray(arr)
+                convex = cv2.convexHull(contour)
+                rect = cv2.minAreaRect(convex)
+                box = cv2.boxPoints(rect)
+                box = box.astype(np.intp)  # np.int0 (an alias of np.intp) was removed in NumPy 2.0
+                box = np.reshape(box,(-1,1)).T
+                c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(box)
+                bbrot = 0.0
+                rotlist.append([id,baseName + '_' + id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,bbrot,qual,script,lang])
+            except Exception:
+                print('...polygon error %s, %s' % (j, baseName))
+                continue
+
+        # then write out all of list to file
+        with open(strPos + ".csv", "w", encoding="utf-8") as f:
+            writer = csv.writer(f)
+            writer.writerow(header)
+            for row in rotlist:
+                writer.writerow(row)
+                write_ctr += 1
+
+        return write_ctr
+
+
+def main(args):
+    """Convert every GEDI .xml file below args.inputDir into a CSV file in args.outputDir.
+
+    Text zones with a polygon are collected per XML file and handed to GEDI2CSV.csvfile;
+    when args.log is set, INFO messages are also written to that file.
+    """
+    writePath = args.outputDir
+    print('write to %s' % (writePath))
+    if not os.path.isdir(writePath):
+        os.makedirs(writePath)
+
+    # Setup logging
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+    if args.log:
+        handler = logging.FileHandler(args.log)
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
+    gtconverter = GEDI2CSV(logger, args)
+    namespaces = {"gedi" : "http://lamp.cfar.umd.edu/media/projects/GEDI/"}
+    keyCnt=0
+
+    fileCnt = 0
+    line_write_ctr = 0
+    line_error_ctr = 0
+    file_error_ctr = 0
+
+    # Get all XML files in the directory and sub folders
+    print('reading %s' % (args.inputDir))
+    for root, dirnames, filenames in os.walk(args.inputDir, followlinks=True):
+        for file in filenames:
+            if not file.lower().endswith('.xml'):
+                continue
+            fullName = os.path.join(root,file)
+            baseName = os.path.splitext(fullName)
+            fileCnt += 1
+            try:
+                tree = ET.parse(fullName)
+            except (ET.ParseError, OSError):
+                print('...ERROR parsing %s' % (fullName))
+                file_error_ctr += 1
+                continue
+
+            documents = tree.getroot().findall('gedi:DL_DOCUMENT',namespaces)
+            if not documents:  # no DL_DOCUMENT element: count the file as unreadable
+                print('...ERROR parsing %s' % (fullName))
+                file_error_ctr += 1
+                continue
+            child = documents[0]
+            coordinates=[]
+            polygons = []
+            pageRot = 0
+
+            # and for each page
+            for pgs in child.iterfind('gedi:DL_PAGE',namespaces):
+                if 'GEDI_orientation' not in pgs.attrib:
+                    pageRot=0
+                else:
+                    pageRot = int(pgs.attrib['GEDI_orientation'])
+                    logger.info(' PAGE ROTATION %s, %s' % (fullName, str(pageRot)))
+                # find children for each page
+                for zone in pgs.findall('gedi:DL_ZONE',namespaces):
+                    # .get() keeps zones without a gedi_type attribute from raising KeyError
+                    if zone.get('gedi_type') != 'Text':
+                        continue
+                    if zone.get('polygon'):
+                        keyCnt+=1
+                        polygons.append([zone.attrib['id'],zone.get('polygon').split(';'),
+                                         zone.get('Text_Content'),zone.get('Illegible'),zone.get('Language')])
+                    else:
+                        print('...Not polygon')
+
+            if len(coordinates) > 0 or len(polygons) > 0:
+                line_write_ctr += gtconverter.csvfile(coordinates, polygons, os.path.splitext(file)[0], pageRot)
+            else:
+                print('...%s has no text content' % (baseName[0]))
+
+    print('complete...total files %d, lines written %d, img errors %d, line error %d' % (fileCnt, line_write_ctr, file_error_ctr, line_error_ctr))
+
+
+def parse_arguments(argv):
+    """Parse argv into a namespace holding inputDir, outputDir and log (all optional strings)."""
+    cli = argparse.ArgumentParser()
+    defaults = {'inputDir': '/data/YOMDLE/final_arabic/xml',
+                'outputDir': '/exp/YOMDLE/final_arabic/csv_truth/',
+                'log': '/exp/logs.txt'}
+    for dest, text in (('inputDir', 'Input directory'), ('outputDir', 'Output directory'), ('log', 'Log directory')):
+        cli.add_argument('--' + dest, type=str, help=text, default=defaults[dest])
+    return cli.parse_args(argv)
+
+
+if __name__ == '__main__':  # run the conversion only when executed as a script
+    """ Run """
+    main(parse_arguments(sys.argv[1:]))  # argv[0] (the program name) is dropped
diff --git a/egs/yomdle_fa/v1/path.sh b/egs/yomdle_fa/v1/path.sh
new file mode 100644
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/yomdle_fa/v1/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/yomdle_fa/v1/run.sh b/egs/yomdle_fa/v1/run.sh
new file mode 100755
index 00000000000..a7547b1ee69
--- /dev/null
+++ b/egs/yomdle_fa/v1/run.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+set -e
+stage=0
+nj=60
+
+database_slam=/export/corpora5/slam/SLAM/Farsi/transcribed
+database_yomdle=/export/corpora5/slam/YOMDLE/final_farsi
+download_dir=data_yomdle_farsi/download/
+extra_lm=download/extra_lm.txt
+data_dir=data_yomdle_farsi
+exp_dir=exp_yomdle_farsi
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if [ $stage -le -1 ]; then
+    local/create_download.sh --database-slam $database_slam \
+        --database-yomdle $database_yomdle \
+        --slam-dir download/slam_farsi \
+        --yomdle-dir download/yomdle_farsi
+fi
+
+if [ $stage -le 0 ]; then
+    mkdir -p data_slam_farsi/slam data_yomdle_farsi/yomdle
+    local/process_data.py download/slam_farsi data_slam_farsi/slam
+    local/process_data.py download/yomdle_farsi data_yomdle_farsi/yomdle
+    # -f/-n replace an existing link instead of nesting a new one inside it on a re-run.
+    ln -sfn ../data_slam_farsi/slam ${data_dir}/test
+    ln -sfn ../data_yomdle_farsi/yomdle ${data_dir}/train
+    image/fix_data_dir.sh ${data_dir}/test
+    image/fix_data_dir.sh ${data_dir}/train
+fi
+
+mkdir -p $data_dir/{train,test}/data
+if [ $stage -le 1 ]; then
+    echo "$0: Obtaining image groups. calling get_image2num_frames"
+    echo "Date: $(date)."
+    image/get_image2num_frames.py --feat-dim 40 $data_dir/train
+    image/get_allowed_lengths.py --frame-subsampling-factor 4 10 $data_dir/train
+
+    for datasplit in train test; do
+        echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $datasplit. "
+        echo "Date: $(date)."
+        local/extract_features.sh --nj $nj --cmd "$cmd" \
+            --feat-dim 40 --num-channels 3 --fliplr true \
+            $data_dir/${datasplit}
+        steps/compute_cmvn_stats.sh $data_dir/${datasplit} || exit 1;
+    done
+
+    echo "$0: Fixing data directory for train dataset"
+    echo "Date: $(date)."
+    utils/fix_data_dir.sh $data_dir/train
+fi
+
+if [ $stage -le 2 ]; then
+    for datasplit in train; do
+        echo "$(date) stage 2: Performing augmentation, it will double training data"
+        local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 40 --fliplr false $data_dir/${datasplit} $data_dir/${datasplit}_aug $data_dir
+        steps/compute_cmvn_stats.sh $data_dir/${datasplit}_aug || exit 1;
+    done
+fi
+
+if [ $stage -le 3 ]; then
+    echo "$0: Preparing dictionary and lang..."
+    if [ ! -f $data_dir/train/bpe.out ]; then
+        cut -d' ' -f2- $data_dir/train/text | local/bidi.py | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > $data_dir/train/bpe.out
+        for datasplit in test train train_aug; do
+            cut -d' ' -f1 $data_dir/$datasplit/text > $data_dir/$datasplit/ids
+            cut -d' ' -f2- $data_dir/$datasplit/text | local/bidi.py | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > $data_dir/$datasplit/bpe_text
+            mv $data_dir/$datasplit/text $data_dir/$datasplit/text.old
+            paste -d' ' $data_dir/$datasplit/ids $data_dir/$datasplit/bpe_text > $data_dir/$datasplit/text
+        done
+    fi
+
+    local/prepare_dict.sh --data-dir $data_dir --dir $data_dir/local/dict
+    # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations.
+    # So we set --sil-prob to 0.0
+    utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
+        $data_dir/local/dict "<sil>" $data_dir/lang/temp $data_dir/lang
+    utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 $data_dir/lang
+fi
+
+if [ $stage -le 4 ]; then
+    echo "$0: Estimating a language model for decoding..."
+    local/train_lm.sh --data-dir $data_dir  --dir $data_dir/local/local_lm
+    utils/format_lm.sh $data_dir/lang $data_dir/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \
+        $data_dir/local/dict/lexicon.txt $data_dir/lang_test
+fi
+
+if [ $stage -le 5 ]; then
+    echo "$0: Calling the flat-start chain recipe..."
+    echo "Date: $(date)." 
+    local/chain/run_flatstart_cnn1a.sh --nj $nj --train-set train_aug --data-dir $data_dir --exp-dir $exp_dir
+fi
+
+if [ $stage -le 6 ]; then
+    echo "$0: Aligning the training data using the e2e chain model..."
+    echo "Date: $(date)."
+    steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
+        --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
+        $data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train
+fi
+
+if [ $stage -le 7 ]; then
+    echo "$0: Building a tree and training a regular chain model using the e2e alignments..."
+    echo "Date: $(date)."
+    local/chain/run_cnn_e2eali_1b.sh --nj $nj --train-set train_aug --data-dir $data_dir --exp-dir $exp_dir
+fi
+
+if [ $stage -le 8 ]; then
+    echo "$0: Estimating a language model for lattice rescoring...$(date)"
+    local/train_lm_lr.sh --data-dir $data_dir  --dir $data_dir/local/local_lm_lr --extra-lm $extra_lm --order 6
+
+    utils/build_const_arpa_lm.sh $data_dir/local/local_lm_lr/data/arpa/6gram_unpruned.arpa.gz \
+        $data_dir/lang_test $data_dir/lang_test_lr
+    steps/lmrescore_const_arpa.sh $data_dir/lang_test $data_dir/lang_test_lr \
+        $data_dir/test $exp_dir/chain/cnn_e2eali_1b/decode_test $exp_dir/chain/cnn_e2eali_1b/decode_test_lr
+fi
diff --git a/egs/yomdle_fa/v1/steps b/egs/yomdle_fa/v1/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/yomdle_fa/v1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/yomdle_fa/v1/utils b/egs/yomdle_fa/v1/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/yomdle_fa/v1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file
diff --git a/egs/yomdle_zh/README.txt b/egs/yomdle_zh/README.txt
new file mode 100644
index 00000000000..39d2348ca10
--- /dev/null
+++ b/egs/yomdle_zh/README.txt
@@ -0,0 +1,3 @@
+This directory contains example scripts for OCR on the Yomdle and Slam datasets.
+Training is done on the Yomdle dataset and testing is done on Slam.
+LM rescoring is also done with extra corpus data obtained from various sources (e.g. Hamshahri)
diff --git a/egs/yomdle_zh/v1/cmd.sh b/egs/yomdle_zh/v1/cmd.sh
new file mode 100755
index 00000000000..3c8eb9f93a5
--- /dev/null
+++ b/egs/yomdle_zh/v1/cmd.sh
@@ -0,0 +1,13 @@
+# you can change cmd.sh depending on what type of queue you are using.
+# If you have no queueing system and want to run on a local machine, you
+# can change all instances 'queue.pl' to run.pl (but be careful and run
+# commands one by one: most recipes will exhaust the memory on your
+# machine).  queue.pl works with GridEngine (qsub).  slurm.pl works
+# with slurm.  Different queues are configured differently, with different
+# queue names and different ways of specifying things like memory;
+# to account for these differences you can create and edit the file
+# conf/queue.conf to match your queue's configuration.  Search for
+# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information,
+# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl.
+
+export cmd="queue.pl"
diff --git a/egs/yomdle_zh/v1/image b/egs/yomdle_zh/v1/image
new file mode 120000
index 00000000000..1668ee99922
--- /dev/null
+++ b/egs/yomdle_zh/v1/image
@@ -0,0 +1 @@
+../../cifar/v1/image/
\ No newline at end of file
diff --git a/egs/yomdle_zh/v1/local/augment_data.sh b/egs/yomdle_zh/v1/local/augment_data.sh
new file mode 100755
index 00000000000..34e938db069
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/augment_data.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright   2018 Hossein Hadian
+#             2018 Ashish Arora
+
+# Apache 2.0
+# This script performs data augmentation.
+
+nj=4
+cmd=run.pl
+feat_dim=40
+fliplr=false
+echo "$0 $@"
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+
+srcdir=$1
+outdir=$2
+datadir=$3
+
+mkdir -p $datadir/augmentations
+echo "copying $srcdir to $datadir/augmentations/aug1, allowed length, creating feats.scp"
+
+for set in aug1; do
+  image/copy_data_dir.sh --spk-prefix $set- --utt-prefix $set- \
+    $srcdir $datadir/augmentations/$set
+  cat $srcdir/allowed_lengths.txt > $datadir/augmentations/$set/allowed_lengths.txt
+  local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim $feat_dim \
+    --fliplr $fliplr --augment true $datadir/augmentations/$set
+done
+
+echo " combine original data and data from different augmentations"
+utils/combine_data.sh --extra-files images.scp $outdir $srcdir $datadir/augmentations/aug1
+cat $srcdir/allowed_lengths.txt > $outdir/allowed_lengths.txt
diff --git a/egs/yomdle_zh/v1/local/bidi.py b/egs/yomdle_zh/v1/local/bidi.py
new file mode 100755
index 00000000000..447313a5d02
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/bidi.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+# Copyright   2018 Chun-Chieh Chang
+
+# This script is largely written by Stephen Rawls
+# and uses the python package https://pypi.org/project/PyICU_BiDi/
+# The code leaves right to left text alone and reverses left to right text.
+
+import icu_bidi
+import io
+import sys
+import unicodedata
+# R=strong right-to-left;  AL=strong arabic right-to-left
+rtl_set =  set(chr(i) for i in range(sys.maxunicode)
+               if unicodedata.bidirectional(chr(i)) in ['R','AL'])
+def determine_text_direction(text):
+    # Easy case first
+    for char in text:
+        if char in rtl_set:
+            return icu_bidi.UBiDiLevel.UBIDI_RTL
+    # If we made it here we did not encounter any strongly rtl char
+    return icu_bidi.UBiDiLevel.UBIDI_LTR
+
+def utf8_visual_to_logical(text):
+    text_dir = determine_text_direction(text)
+
+    bidi = icu_bidi.Bidi()
+    bidi.inverse = True
+    bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_INVERSE_LIKE_DIRECT
+    bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT # icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS
+
+    bidi.set_para(text, text_dir, None)
+
+    res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
+
+    return res
+
+def utf8_logical_to_visual(text):
+    text_dir = determine_text_direction(text)
+
+    bidi = icu_bidi.Bidi()
+
+    bidi.reordering_mode = icu_bidi.UBiDiReorderingMode.UBIDI_REORDER_DEFAULT
+    bidi.reordering_options = icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_DEFAULT  #icu_bidi.UBiDiReorderingOption.UBIDI_OPTION_INSERT_MARKS
+
+    bidi.set_para(text, text_dir, None)
+
+    res = bidi.get_reordered(0 | icu_bidi.UBidiWriteReorderedOpt.UBIDI_DO_MIRRORING | icu_bidi.UBidiWriteReorderedOpt.UBIDI_KEEP_BASE_COMBINING)
+
+    return res
+
+
+##main##
+sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8")
+sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8")
+for line in sys.stdin:
+    line = line.strip()
+    line = utf8_logical_to_visual(line)[::-1]
+    sys.stdout.write(line + '\n')
diff --git a/egs/yomdle_zh/v1/local/chain/compare_wer.sh b/egs/yomdle_zh/v1/local/chain/compare_wer.sh
new file mode 100755
index 00000000000..ab880c1adb5
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/chain/compare_wer.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+
+# this script is used for comparing decoding results between systems.
+# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b}
+
+# Copyright      2017  Chun Chieh Chang
+#                2017  Ashish Arora
+
+if [ $# == 0 ]; then
+  echo "Usage: $0: <dir1> [<dir2> ... ]"
+  echo "e.g.: $0 exp/chain/cnn{1a,1b}"
+  exit 1
+fi
+
+echo "# $0 $*"
+used_epochs=false
+
+echo -n "# System                     "
+for x in $*; do   printf "% 10s" " $(basename $x)";   done
+echo
+
+echo -n "# WER                        "
+for x in $*; do
+  wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}')
+  printf "% 10s" $wer
+done
+echo
+
+echo -n "# CER                        "
+for x in $*; do
+  cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
+  printf "% 10s" $cer
+done
+echo
+
+
+if $used_epochs; then
+  exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
+fi
+
+echo -n "# Final train prob           "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob           "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final train prob (xent)    "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
+
+echo -n "# Final valid prob (xent)    "
+for x in $*; do
+  prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}')
+  printf "% 10s" $prob
+done
+echo
diff --git a/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh
new file mode 100755
index 00000000000..4183aa74587
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/chain/run_cnn_e2eali_1b.sh
@@ -0,0 +1,245 @@
+#!/bin/bash
+
+# e2eali_1b is the same as chainali_1a but uses the e2e chain model to get the
+# lattice alignments and to build a tree
+
+# ./local/chain/compare_wer.sh exp_yomdle_chinese/chain/e2e_cnn_1a exp_yomdle_chinese/chain/cnn_e2eali_1b
+# System                      e2e_cnn_1a cnn_e2eali_1b
+# CER                             15.44     13.57
+# Final train prob               0.0616   -0.0512
+# Final valid prob               0.0390   -0.0718
+# Final train prob (xent)                 -0.6199
+# Final valid prob (xent)                 -0.7448
+
+set -e -o pipefail
+
+data_dir=data
+exp_dir=exp
+
+stage=0
+
+nj=30
+train_set=train
+nnet3_affix=    # affix for exp dirs, e.g. it was _cleaned in tedlium.
+affix=_1b  #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration.
+common_egs_dir=
+reporting_email=
+
+# chain options
+train_stage=-10
+xent_regularize=0.1
+frame_subsampling_factor=4
+# training chunk-options
+chunk_width=340,300,200,100
+num_leaves=1000
+# we don't need extra left/right context for TDNN systems.
+chunk_left_context=0
+chunk_right_context=0
+tdnn_dim=450
+# training options
+srand=0
+remove_egs=true
+lang_test=lang_test
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+e2echain_model_dir=$exp_dir/chain/e2e_cnn_1a
+ali_dir=$exp_dir/chain/e2e_ali_train
+lat_dir=$exp_dir/chain${nnet3_affix}/e2e_${train_set}_lats
+dir=$exp_dir/chain${nnet3_affix}/cnn_e2eali${affix}
+train_data_dir=$data_dir/${train_set}
+tree_dir=$exp_dir/chain${nnet3_affix}/tree_e2e
+
+# the 'lang' directory is created by this script.
+# If you create such a directory with a non-standard topology
+# you should probably name it differently.
+lang=$data_dir/lang_chain
+for f in $train_data_dir/feats.scp $ali_dir/ali.1.gz $ali_dir/final.mdl; do
+  [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1
+done
+
+
+if [ $stage -le 1 ]; then
+  echo "$0: creating lang directory $lang with chain-type topology"
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  if [ -d $lang ]; then
+    if [ $lang/L.fst -nt $data_dir/lang/L.fst ]; then
+      echo "$0: $lang already exists, not overwriting it; continuing"
+    else
+      echo "$0: $lang already exists and seems to be older than data/lang..."
+      echo " ... not sure what to do.  Exiting."
+      exit 1;
+    fi
+  else
+    cp -r $data_dir/lang $lang
+    silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+    nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+    # Use our special topology... note that later on may have to tune this
+    # topology.
+    steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+  fi
+fi
+
+if [ $stage -le 2 ]; then
+  # Get the alignments as lattices (gives the chain training more freedom).
+  # use the same num-jobs as the alignments
+  steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \
+                            --acoustic-scale 1.0 \
+                            --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \
+                            ${train_data_dir} $data_dir/lang $e2echain_model_dir $lat_dir
+  echo "" >$lat_dir/splice_opts
+
+fi
+
+if [ $stage -le 3 ]; then
+  # Build a tree using our new topology.  The alignments are read from
+  # $ali_dir (its ali.1.gz and final.mdl are checked for near the top of this
+  # script); this script uses neither speed-perturbed data nor a GMM baseline.
+  # The number of leaves is controlled by $num_leaves.
+  if [ -f $tree_dir/final.mdl ]; then
+    echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it."
+    exit 1;
+  fi
+
+  steps/nnet3/chain/build_tree.sh \
+    --frame-subsampling-factor $frame_subsampling_factor \
+    --alignment-subsampling-factor 1 \
+    --context-opts "--context-width=3 --central-position=1" \
+    --cmd "$cmd" $num_leaves ${train_data_dir} \
+    $lang $ali_dir $tree_dir
+fi
+
+
+if [ $stage -le 4 ]; then
+  mkdir -p $dir
+  echo "$0: creating neural net configs using the xconfig parser";
+  num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}')
+  learning_rate_factor=$(echo "print(0.5/$xent_regularize)" | python)  # print() form is valid under both python 2 and python 3
+  cnn_opts="l2-regularize=0.075"
+  tdnn_opts="l2-regularize=0.075"
+  output_opts="l2-regularize=0.1"
+  common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=32"
+  common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=128"
+  common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=512"
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=180 name=input
+
+  conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1
+  conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1
+  conv-relu-batchnorm-layer name=cnn3 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn6 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-4,0,4 $common3
+  conv-relu-batchnorm-layer name=cnn8 height-in=15 height-out=15 time-offsets=-4,0,4 $common3
+  conv-relu-batchnorm-layer name=cnn9 height-in=15 height-out=15 time-offsets=-4,0,4 $common3
+  relu-batchnorm-layer name=tdnn1 input=Append(-8,-4,0,4,8) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
+
+  # adding the layers for xent branch
+  # This block prints the configs for a separate output that will be
+  # trained with a cross-entropy objective in the 'chain' mod?els... this
+  # has the effect of regularizing the hidden parts of the model.  we use
+  # 0.5 / args.xent_regularize as the learning rate factor- the factor of
+  # 0.5 / args.xent_regularize is suitable as it means the xent
+  # final-layer learns at a rate independent of the regularization
+  # constant; and the 0.5 was tuned so as to make the relative progress
+  # similar in the xent and regular final layers.
+  relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts
+  output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts
+EOF
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/
+fi
+
+
+if [ $stage -le 5 ]; then
+  if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then
+    utils/create_split_dir.pl \
+     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage
+  fi
+
+  steps/nnet3/chain/train.py --stage=$train_stage \
+    --cmd="$cmd" \
+    --feat.cmvn-opts="--norm-means=false --norm-vars=false" \
+    --chain.xent-regularize $xent_regularize \
+    --chain.leaky-hmm-coefficient=0.1 \
+    --chain.l2-regularize=0.00005 \
+    --chain.apply-deriv-weights=false \
+    --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=500" \
+    --chain.frame-subsampling-factor=$frame_subsampling_factor \
+    --chain.alignment-subsampling-factor=1 \
+    --chain.left-tolerance 3 \
+    --chain.right-tolerance 3 \
+    --trainer.srand=$srand \
+    --trainer.max-param-change=2.0 \
+    --trainer.num-epochs=6 \
+    --trainer.frames-per-iter=1000000 \
+    --trainer.optimization.num-jobs-initial=4 \
+    --trainer.optimization.num-jobs-final=8 \
+    --trainer.optimization.initial-effective-lrate=0.001 \
+    --trainer.optimization.final-effective-lrate=0.0001 \
+    --trainer.optimization.shrink-value=1.0 \
+    --trainer.num-chunk-per-minibatch=16,8 \
+    --trainer.optimization.momentum=0.0 \
+    --egs.chunk-width=$chunk_width \
+    --egs.chunk-left-context=$chunk_left_context \
+    --egs.chunk-right-context=$chunk_right_context \
+    --egs.chunk-left-context-initial=0 \
+    --egs.chunk-right-context-final=0 \
+    --egs.dir="$common_egs_dir" \
+    --egs.opts="--frames-overlap-per-eg 0 --constrained false" \
+    --cleanup.remove-egs=$remove_egs \
+    --use-gpu=true \
+    --reporting.email="$reporting_email" \
+    --feat-dir=$train_data_dir \
+    --tree-dir=$tree_dir \
+    --lat-dir=$lat_dir \
+    --dir=$dir  || exit 1;
+fi
+
+if [ $stage -le 6 ]; then
+  # The reason we are using data/lang here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model).  So you could give it a different
+  # lang directory, one that contained a wordlist and LM of your choice,
+  # as long as phones.txt was compatible.
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 $data_dir/$lang_test \
+    $dir $dir/graph || exit 1;
+fi
+
+if [ $stage -le 7 ]; then
+  frames_per_chunk=$(echo $chunk_width | cut -d, -f1)
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --extra-left-context $chunk_left_context \
+    --extra-right-context $chunk_right_context \
+    --extra-left-context-initial 0 \
+    --extra-right-context-final 0 \
+    --frames-per-chunk $frames_per_chunk \
+    --nj $nj --cmd "$cmd" \
+    $dir/graph $data_dir/test $dir/decode_test || exit 1;
+fi
diff --git a/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh b/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh
new file mode 100755
index 00000000000..88bbd32790c
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/chain/run_flatstart_cnn1a.sh
@@ -0,0 +1,169 @@
+#!/bin/bash
+# Copyright    2017  Hossein Hadian
+
+# This script does end2end chain training (i.e. from scratch)
+
+# ./local/chain/compare_wer.sh exp_yomdle_chinese/chain/e2e_cnn_1a exp_yomdle_chinese/chain/cnn_e2eali_1b
+# System                      e2e_cnn_1a cnn_e2eali_1b
+# CER                             15.44     13.57
+# Final train prob               0.0616   -0.0512
+# Final valid prob               0.0390   -0.0718
+# Final train prob (xent)                 -0.6199
+# Final valid prob (xent)                 -0.7448
+
+set -e
+
+data_dir=data
+exp_dir=exp
+
+# configs for 'chain'
+stage=0
+nj=30
+train_stage=-10
+get_egs_stage=-10
+affix=1a
+
+# training options
+tdnn_dim=450
+num_epochs=4
+num_jobs_initial=4
+num_jobs_final=8
+minibatch_size=150=64,32/300=32,16/600=16,8/1200=8,4
+common_egs_dir=
+l2_regularize=0.00005
+frames_per_iter=1000000
+cmvn_opts="--norm-means=false --norm-vars=false"
+train_set=train
+lang_test=lang_test
+
+# End configuration section.
+echo "$0 $@"  # Print the command line for logging
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if ! cuda-compiled; then
+  cat <<EOF && exit 1
+This script is intended to be used with GPUs but you have not compiled Kaldi with CUDA
+If you want to use GPUs (and have them), go to src/, and configure and make on a machine
+where "nvcc" is installed.
+EOF
+fi
+
+lang=$data_dir/lang_e2e
+treedir=$exp_dir/chain/e2e_monotree  # it's actually just a trivial tree (no tree building)
+dir=$exp_dir/chain/e2e_cnn_${affix}
+
+if [ $stage -le 0 ]; then
+  # Create a version of the lang/ directory that has one state per phone in the
+  # topo file. [note, it really has two states.. the first one is only repeated
+  # once, the second one has zero or more repeats.]
+  rm -rf $lang
+  cp -r $data_dir/lang $lang
+  silphonelist=$(cat $lang/phones/silence.csl) || exit 1;
+  nonsilphonelist=$(cat $lang/phones/nonsilence.csl) || exit 1;
+  # Use our special topology... note that later on may have to tune this
+  # topology.
+  steps/nnet3/chain/gen_topo.py $nonsilphonelist $silphonelist >$lang/topo
+fi
+
+if [ $stage -le 1 ]; then
+  steps/nnet3/chain/e2e/prepare_e2e.sh --nj $nj --cmd "$cmd" \
+                                       --shared-phones true \
+                                       --type mono \
+                                       $data_dir/$train_set $lang $treedir
+  $cmd $treedir/log/make_phone_lm.log \
+  cat $data_dir/$train_set/text \| \
+    steps/nnet3/chain/e2e/text_to_phones.py $data_dir/lang \| \
+    utils/sym2int.pl -f 2- $data_dir/lang/phones.txt \| \
+    chain-est-phone-lm --num-extra-lm-states=500 \
+                       ark:- $treedir/phone_lm.fst
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: creating neural net configs using the xconfig parser";
+  num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}')
+  
+  cnn_opts="l2-regularize=0.075"
+  tdnn_opts="l2-regularize=0.075"
+  output_opts="l2-regularize=0.1"
+
+  common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=32"
+  common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=128"
+  common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=512"
+  mkdir -p $dir/configs
+  cat <<EOF > $dir/configs/network.xconfig
+  input dim=180 name=input
+  conv-relu-batchnorm-layer name=cnn1 height-in=60 height-out=60 time-offsets=-3,-2,-1,0,1,2,3 $common1
+  conv-relu-batchnorm-layer name=cnn2 height-in=60 height-out=30 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn3 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn4 height-in=30 height-out=30 time-offsets=-4,-2,0,2,4 $common2
+  conv-relu-batchnorm-layer name=cnn5 height-in=30 height-out=15 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2
+  conv-relu-batchnorm-layer name=cnn6 height-in=15 height-out=15 time-offsets=-4,0,4 $common3
+  conv-relu-batchnorm-layer name=cnn7 height-in=15 height-out=15 time-offsets=-4,0,4 $common3
+  relu-batchnorm-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts
+  ## adding the layers for chain branch
+  relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts
+  output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts
+EOF
+
+  steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs
+fi
+
+if [ $stage -le 3 ]; then
+  # no need to store the egs in a shared storage because we always
+  # remove them. Anyway, it takes only 5 minutes to generate them.
+
+  steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \
+    --cmd "$cmd" \
+    --feat.cmvn-opts "$cmvn_opts" \
+    --chain.leaky-hmm-coefficient 0.1 \
+    --chain.l2-regularize $l2_regularize \
+    --chain.apply-deriv-weights false \
+    --egs.dir "$common_egs_dir" \
+    --egs.stage $get_egs_stage \
+    --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
+    --chain.frame-subsampling-factor 4 \
+    --chain.alignment-subsampling-factor 4 \
+    --trainer.add-option="--optimization.memory-compression-level=2" \
+    --trainer.num-chunk-per-minibatch $minibatch_size \
+    --trainer.frames-per-iter $frames_per_iter \
+    --trainer.num-epochs $num_epochs \
+    --trainer.optimization.momentum 0 \
+    --trainer.optimization.num-jobs-initial $num_jobs_initial \
+    --trainer.optimization.num-jobs-final $num_jobs_final \
+    --trainer.optimization.initial-effective-lrate 0.001 \
+    --trainer.optimization.final-effective-lrate 0.0001 \
+    --trainer.optimization.shrink-value 1.0 \
+    --trainer.max-param-change 2.0 \
+    --cleanup.remove-egs true \
+    --feat-dir $data_dir/${train_set} \
+    --tree-dir $treedir \
+    --dir $dir  || exit 1;
+fi
+
+if [ $stage -le 4 ]; then
+  # The reason we are using data/lang here, instead of $lang, is just to
+  # emphasize that it's not actually important to give mkgraph.sh the
+  # lang directory with the matched topology (since it gets the
+  # topology file from the model).  So you could give it a different
+  # lang directory, one that contained a wordlist and LM of your choice,
+  # as long as phones.txt was compatible.
+
+  utils/mkgraph.sh \
+    --self-loop-scale 1.0 $data_dir/$lang_test \
+    $dir $dir/graph || exit 1;
+fi
+
+if [ $stage -le 5 ]; then
+  # chunk_width is not defined in this script, so no --frames-per-chunk option is passed to decode.sh.
+  steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \
+    --nj $nj --cmd "$cmd" \
+    $dir/graph $data_dir/test $dir/decode_test || exit 1;
+fi
+
+echo "Done. Date: $(date). Results:"
+local/chain/compare_wer.sh $dir
diff --git a/egs/yomdle_zh/v1/local/create_download.sh b/egs/yomdle_zh/v1/local/create_download.sh
new file mode 100755
index 00000000000..3c4be4699ef
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/create_download.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright 2018 Chun-Chieh Chang
+
+# The original format of the dataset given is GEDI and page images.
+# This script is written to create line images from page images.
+# It also creates csv files from the GEDI files.
+
+database_slam=/export/corpora5/slam/SLAM/Farsi/transcribed
+database_yomdle=/export/corpora5/slam/YOMDLE/final_farsi
+cangjie_url=https://raw.githubusercontent.com/wanleung/libcangjie/master/tables/cj5-cc.txt
+download_dir=download
+slam_dir=$download_dir/slam_farsi
+yomdle_dir=$download_dir/yomdle_farsi
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1; 
+
+echo "$0: Processing SLAM ${language}"
+echo "Date: $(date)."
+mkdir -p ${slam_dir}/{truth_csv,truth_csv_raw,truth_line_image}
+local/gedi2csv.py \
+    --inputDir ${database_slam} \
+    --outputDir ${slam_dir}/truth_csv_raw \
+    --log ${slam_dir}/GEDI2CSV_enriched.log
+local/create_line_image_from_page_image.py \
+    ${database_slam} \
+    ${slam_dir}/truth_csv_raw \
+    ${slam_dir}
+
+echo "$0: Processing YOMDLE ${language}"
+echo "Date: $(date)."
+mkdir -p ${yomdle_dir}/{truth_csv,truth_csv_raw,truth_line_image}
+local/yomdle2csv.py \
+    --inputDir ${database_yomdle} \
+    --outputDir ${yomdle_dir}/truth_csv_raw/ \
+    --log ${yomdle_dir}/YOMDLE2CSV.log
+local/create_line_image_from_page_image.py \
+    --im-format "jpg" \
+    ${database_yomdle}/images \
+    ${yomdle_dir}/truth_csv_raw \
+    ${yomdle_dir}
+
+echo "Downloading table for CangJie."
+wget -P $download_dir/ $cangjie_url || exit 1;
diff --git a/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py b/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py
new file mode 100755
index 00000000000..77a6791d5d7
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/create_line_image_from_page_image.py
@@ -0,0 +1,458 @@
+#!/usr/bin/env python3
+
+# Copyright   2018 Ashish Arora
+# Apache 2.0
+# minimum bounding box part in this script is originally from
+#https://github.com/BebeSparkelSparkel/MinimumBoundingBox
+#https://startupnextdoor.com/computing-convex-hull-in-python/
+""" This module is used for extracting line images from a page image.
+ Given the word segmentation (bounding box around a word) for every word, it
+ extracts the line segmentation. To do so, it takes the word bounding boxes of a
+ line as input and creates a minimum area bounding box that contains all corner
+ points of the word bounding boxes. The obtained bounding box will not necessarily
+ be vertically or horizontally aligned. Hence, to extract the line image from the
+ line bounding box, the page image is rotated and the line image is cropped and saved.
+"""
+
+import argparse
+import csv
+import itertools
+import sys
+import os
+import numpy as np
+from math import atan2, cos, sin, pi, degrees, sqrt
+from collections import namedtuple
+
+from scipy.spatial import ConvexHull
+from PIL import Image
+from scipy.misc import toimage
+
+parser = argparse.ArgumentParser(description="Creates line images from page image")  # command line of this script
+parser.add_argument('image_dir', type=str, help='Path to full page images')
+parser.add_argument('csv_dir', type=str, help='Path to csv files')
+parser.add_argument('out_dir', type=str, help='Path to output directory')
+parser.add_argument('--im-format', type=str, default='png', help='What file format are the images')
+parser.add_argument('--padding', type=int, default=100, help='Padding so BBox does not exceed image area')
+parser.add_argument('--head', type=int, default=-1, help='Number of csv files to process')  # a negative value means: process all
+args = parser.parse_args()  # parsed at import time; pad_image() and the main loop below read it
+
+"""
+bounding_box_tuple describes a (possibly rotated) rectangle. Its fields are:
+             area (float): area of the rectangle
+             length_parallel (float): length of the side parallel to unit_vector
+             length_orthogonal (float): length of the side orthogonal to unit_vector
+             rectangle_center (float, float): coordinates of the rectangle center
+             (use rectangle_corners to get the corner points of the rectangle)
+             unit_vector (float, float): direction of the length_parallel side
+             (its orthogonal vector can be found with the orthogonal_vector function)
+             unit_vector_angle (float): angle of unit_vector, in radians
+             corner_points {(float, float)}: set that contains the corners of the rectangle
+"""
+
+bounding_box_tuple = namedtuple('bounding_box_tuple', [
+    'area',
+    'length_parallel',
+    'length_orthogonal',
+    'rectangle_center',
+    'unit_vector',
+    'unit_vector_angle',
+    'corner_points'])
+
+
+def unit_vector(pt0, pt1):
+    """ Given two points pt0 and pt1, return the vector of length one that
+        points from pt0 towards pt1 (the two points must not coincide).
+    Returns
+    -------
+    (float, float): unit vector
+    """
+    dx, dy = pt1[0] - pt0[0], pt1[1] - pt0[1]
+    norm = sqrt(dx**2 + dy**2)
+    return dx / norm, dy / norm
+
+
+def orthogonal_vector(vector):
+    """ Given a 2D vector (x, y), returns the perpendicular vector (-y, x) of equal length.
+    Returns
+    ------
+    (float, float): A vector that points in the direction orthogonal to vector.
+    """
+    return (-vector[1], vector[0])
+
+
+def bounding_area(index, hull):
+    """ Given an index and a convex hull (sequence of points), takes the hull edge
+        from hull[index] to hull[index+1] and computes the rectangle that bounds
+        all hull points and has one side parallel to that edge.
+    Returns
+    -------
+    a dict that contains:
+    area: area of the rectangle
+    length_parallel: length of the side that is parallel to unit_vector
+    length_orthogonal: length of the side that is orthogonal to unit_vector
+    rectangle_center: center of the rectangle, expressed by its projections onto
+                      unit_vector and onto the orthogonal vector (not in x, y)
+    unit_vector: direction of the length_parallel side.
+    (its orthogonal vector can be found with the orthogonal_vector function)
+    """
+    edge_direction = unit_vector(hull[index], hull[index+1])
+    edge_normal = orthogonal_vector(edge_direction)
+
+    along_edge = [np.dot(edge_direction, pt) for pt in hull]
+    across_edge = [np.dot(edge_normal, pt) for pt in hull]
+
+    min_p, max_p = min(along_edge), max(along_edge)
+    min_o, max_o = min(across_edge), max(across_edge)
+    len_p = max_p - min_p
+    len_o = max_o - min_o
+
+    rectangle = {}
+    rectangle['area'] = len_p * len_o
+    rectangle['length_parallel'] = len_p
+    rectangle['length_orthogonal'] = len_o
+    rectangle['rectangle_center'] = (min_p + len_p / 2, min_o + len_o / 2)
+    rectangle['unit_vector'] = edge_direction
+    return rectangle
+
+
+def to_xy_coordinates(unit_vector_angle, point):
+    """ Converts a point expressed along the (unit vector, orthogonal vector)
+        axes into x, y coordinates. unit_vector_angle is the angle of the
+        unit vector from the horizontal axis and should be in radians.
+    Returns
+    ------
+    (float, float): x, y coordinates of the point.
+    """
+    cos_p, sin_p = cos(unit_vector_angle), sin(unit_vector_angle)
+    cos_o, sin_o = cos(unit_vector_angle + pi / 2), sin(unit_vector_angle + pi / 2)
+    return point[0] * cos_p + point[1] * cos_o, point[0] * sin_p + point[1] * sin_o
+
+
+def rotate_points(center_of_rotation, angle, points):
+    """ Rotates a point cloud around the center_of_rotation point by angle
+    input
+    -----
+    center_of_rotation (float, float): point around which the points are rotated.
+    angle (float): angle of rotation, in radians.
+    points [(float, float)]: list or tuple of the points to be rotated.
+    Returns
+    ------
+    [(float, float)]: Rotated points around center of rotation by angle
+    """
+    rot_points = []
+    for pt in points:
+        dx = pt[0] - center_of_rotation[0]
+        dy = pt[1] - center_of_rotation[1]
+        # polar form of the offset from the center, with angle added to its direction
+        radius = sqrt(dx**2 + dy**2)
+        theta = atan2(dy, dx) + angle
+        rot_points.append((center_of_rotation[0] + radius * cos(theta),
+                           center_of_rotation[1] + radius * sin(theta)))
+
+    return rot_points
+
+
+def rectangle_corners(rectangle):
+    """ Given a rectangle dict (rectangle_center, length_parallel, length_orthogonal
+        and unit_vector_angle), returns the locations of its four corners.
+    Returns
+    ------
+    [(float, float)]: 4 corner points of rectangle.
+    """
+    signs = [(.5, .5), (.5, -.5), (-.5, -.5), (-.5, .5)]
+    center = rectangle['rectangle_center']
+    corner_points = [(center[0] + i1 * rectangle['length_parallel'],
+                      center[1] + i2 * rectangle['length_orthogonal'])
+                     for i1, i2 in signs]
+
+    return rotate_points(rectangle['rectangle_center'], rectangle['unit_vector_angle'], corner_points)
+
+
+def get_orientation(origin, p1, p2):
+    """
+    Given origin and two points, computes the cross product of (p2 - origin)
+    and (p1 - origin); its sign tells on which side of p2 the point p1 lies.
+    Returns
+    -------
+    number: positive, negative, or zero when the three points are collinear.
+    """
+    p1_dx, p1_dy = p1[0] - origin[0], p1[1] - origin[1]
+    p2_dx, p2_dy = p2[0] - origin[0], p2[1] - origin[1]
+
+    difference = p2_dx * p1_dy - p1_dx * p2_dy
+    return difference
+
+
+def compute_hull(points):
+    """
+    Given input list of points, walks around them, one hull point per step, and
+    returns the hull points; the walk starts and ends at the point of smallest x.
+    Returns
+    -------
+    [(float, float)]: convexhull points (the start point is listed first and last)
+    """
+    hull_points = []
+    start = points[0]
+    min_x = start[0]
+    for p in points[1:]:
+        if p[0] < min_x:  # strict comparison: the first point of smallest x wins
+            min_x = p[0]
+            start = p
+
+    point = start
+    hull_points.append(start)
+
+    far_point = None
+    while far_point is not start:  # identity test: stops once the walk is back at the start object
+        p1 = None
+        for p in points:  # first point that is not the current one is the initial candidate
+            if p is point:
+                continue
+            else:
+                p1 = p
+                break
+
+        far_point = p1
+
+        for p2 in points:
+            if p2 is point or p2 is p1:
+                continue
+            else:
+                direction = get_orientation(point, far_point, p2)
+                if direction > 0:  # p2 replaces the candidate whenever the orientation is positive
+                    far_point = p2
+
+        hull_points.append(far_point)
+        point = far_point
+    return hull_points
+
+
+def minimum_bounding_box(points):
+    """ Given a list of 2D points, it returns the minimum area rectangle bounding all
+        the points in the point cloud. Raises ValueError for fewer than three points.
+    Returns
+    ------
+    returns a namedtuple (bounding_box_tuple) that contains:
+    area: area of the rectangle
+    length_parallel: length of the side that is parallel to unit_vector
+    length_orthogonal: length of the side that is orthogonal to unit_vector
+    rectangle_center: coordinates (x, y) of the rectangle center
+    unit_vector: direction of the length_parallel side
+    unit_vector_angle: angle of the unit vector, in radians
+    corner_points: set that contains the corners of the rectangle
+    """
+
+    if len(points) <= 2:
+        raise ValueError('More than two points required.')
+
+    # hull vertices in order (scipy); the first vertex is repeated at the end so
+    # that every hull edge is a pair (hull_ordered[i], hull_ordered[i+1])
+    hull_ordered = [points[index] for index in ConvexHull(points).vertices]
+    hull_ordered.append(hull_ordered[0])
+    # alternative without scipy: hull_ordered = compute_hull(points)
+    hull_ordered = tuple(hull_ordered)
+
+    # one candidate rectangle per hull edge; min() keeps the first of minimal area
+    edge_count = len(hull_ordered) - 1
+    candidates = (bounding_area(i, hull_ordered) for i in range(edge_count))
+    best = min(candidates, key=lambda rect: rect['area'])
+
+    # the center is converted from edge-aligned coordinates to x, y coordinates
+    angle = atan2(best['unit_vector'][1], best['unit_vector'][0])
+    best['unit_vector_angle'] = angle
+    best['rectangle_center'] = to_xy_coordinates(angle, best['rectangle_center'])
+
+    # best now holds exactly the six remaining fields of bounding_box_tuple
+    return bounding_box_tuple(
+        corner_points = set(rectangle_corners(best)),
+        **best
+    )
+
+
+def get_center(im):
+    """ Given image, returns the location of its center pixel, i.e. half
+        of im.size[0] and half of im.size[1], both truncated to integers.
+    Returns
+    -------
+    (int, int): center of the image
+    """
+    size_x, size_y = im.size[0], im.size[1]
+    return int(size_x / 2), int(size_y / 2)
+
+
+def get_horizontal_angle(unit_vector_angle):
+    """ Given an angle in radians, folds it into the first or fourth quadrant
+        by subtracting or adding pi (angles already there are returned as is).
+    Returns
+    ------
+    (float): updated angle of the unit vector to be in radians.
+             For inputs in (-pi, pi] it is in the first or fourth quadrant.
+    """
+    if pi / 2 < unit_vector_angle <= pi:
+        return unit_vector_angle - pi
+    if -pi < unit_vector_angle < -pi / 2:
+        return unit_vector_angle + pi
+
+    return unit_vector_angle
+
+
+def get_smaller_angle(bounding_box):
+    """ Given a rectangle, returns the angle from the horizontal axis (of either its
+        unit vector or the orthogonal vector) that has the smallest absolute value.
+    Returns
+    ------
+    (float): smallest angle of the rectangle to be in radians.
+    """
+    ortho_vector = orthogonal_vector(bounding_box.unit_vector)
+    ortho_vector_angle = atan2(ortho_vector[1], ortho_vector[0])
+
+    # fold both directions into the first/fourth quadrant before comparing them
+    parallel_angle = get_horizontal_angle(bounding_box.unit_vector_angle)
+    orthogonal_angle = get_horizontal_angle(ortho_vector_angle)
+
+    # on a tie the angle of the orthogonal vector is returned
+    if abs(parallel_angle) < abs(orthogonal_angle):
+        return parallel_angle
+    return orthogonal_angle
+
+
+def rotated_points(bounding_box, center):
+    """ Given the rectangle, rotates its 4 corner points around center by
+        minus its smallest angle (see get_smaller_angle).
+        corner_points is a set, so the order in which the four points are
+        returned follows the iteration order of that set.
+    Returns
+    -------
+    8 floats: x1, y1, x2, y2, x3, y3, x4, y4 of the rotated corner points.
+    """
+    # unpacking enforces that the set holds exactly 4 distinct corner points
+    p1, p2, p3, p4 = bounding_box.corner_points
+    center_x, center_y = center
+    rotation_angle_in_rad = -get_smaller_angle(bounding_box)
+    cos_a = cos(rotation_angle_in_rad)
+    sin_a = sin(rotation_angle_in_rad)
+
+    coordinates = []
+    for x, y in (p1, p2, p3, p4):
+        dx, dy = x - center_x, y - center_y
+        coordinates.append(dx * cos_a - dy * sin_a + center_x)
+        coordinates.append(dy * cos_a + dx * sin_a + center_y)
+
+    # flat tuple: x and y of the first point, then of the second point, ...
+    return tuple(coordinates)
+
+
+def pad_image(image):
+    """ Given an image, returns it pasted onto a white RGB canvas that is
+        args.padding pixels larger in both dimensions (offset args.padding // 2).
+        Intended to avoid crashes for boxes slightly outside the page boundary.
+    Returns
+    -------
+    image: padded page image
+    """
+    padded_size = (image.size[0] + int(args.padding), image.size[1] + int(args.padding))
+    padded_image = Image.new('RGB', padded_size, "white")
+    padded_image.paste(im = image, box = (int(args.padding // 2),) * 2)
+    return padded_image
+
+def update_minimum_bounding_box_input(bounding_box_input):
+    """ Given list of 2D points, returns list of the same points shifted by an
+        offset of args.padding // 2 in both x and y. This is the offset at which
+        pad_image pastes the page, so the shifted points index the padded image.
+    input
+    -----
+    bounding_box_input [(float, float)]: points, a list or tuple of 2D coordinates
+    Returns
+    ------
+    points [(float, float)]: shifted points, a list of 2D coordinates
+    """
+    offset = int(args.padding // 2)
+    shifted_points = [(x + offset, y + offset)
+                      for x, y in bounding_box_input]
+
+    return shifted_points
+
+
+### main ###
+# For every csv file in csv_dir that has a matching page image (only the first
+# --head csv files are considered when --head >= 0): cut each annotated line out
+# of the padded page image, rotate it so that the text is horizontal and save it
+# in <out_dir>/truth_line_image. The rows whose line image was saved are copied,
+# without the header row, to <out_dir>/truth_csv/<csv file name>.
+csv_count = 0
+for filename in sorted(os.listdir(args.csv_dir)):
+    if not filename.endswith('.csv') or 0 <= args.head <= csv_count:
+        continue
+    csv_count = csv_count + 1
+    image_file = os.path.join(args.image_dir, os.path.splitext(filename)[0] + '.' + args.im_format)
+    if not os.path.isfile(image_file):
+        continue
+
+    csv_in_file = os.path.join(args.csv_dir, filename)
+    csv_out_file = os.path.join(args.out_dir, 'truth_csv', filename)
+    im = pad_image(Image.open(image_file))
+
+    # Both csv files are closed by the with-statement; the output file used to be
+    # opened with a bare open() and was never closed explicitly.
+    with open(csv_in_file, 'r', encoding='utf-8') as f, \
+         open(csv_out_file, 'w', encoding='utf-8') as csv_out_fh:
+        csv_out_writer = csv.writer(csv_out_fh)
+        rows = csv.reader(f)
+        next(rows, None)  # the first row is the header
+        for row in rows:
+            # columns 2..9 hold the four corners as x1,y1,x2,y2,x3,y3,x4,y4
+            x = [int(row[i]) for i in (2, 4, 6, 8)]
+            y = [int(row[i]) for i in (3, 5, 7, 9)]
+            points = list(zip(x, y))
+            # skip boxes that have no width or no height
+            if min(x) == max(x) or min(y) == max(y):
+                continue
+
+            try:
+                # shift the points by the padding offset, then fit the rectangle
+                updated_mbb_input = update_minimum_bounding_box_input(points)
+                bounding_box = minimum_bounding_box(updated_mbb_input)
+            except Exception:
+                print("Error: Skipping Image " + row[1])
+                continue
+
+            # first crop: axis-aligned box around the (possibly rotated) rectangle
+            corners = list(bounding_box.corner_points)
+            min_x = int(min(pt[0] for pt in corners))
+            min_y = int(min(pt[1] for pt in corners))
+            max_x = int(max(pt[0] for pt in corners))
+            max_y = int(max(pt[1] for pt in corners))
+            box = (min_x, min_y, max_x, max_y)
+            region_initial = im.crop(box)
+
+            # Express the rectangle in the coordinates of the cropped region. The
+            # center used to be filled with length_orthogonal (a copy-paste slip);
+            # it is now the real center, shifted like the corner points.
+            rot_points = [(pt[0] - min_x, pt[1] - min_y) for pt in corners]
+            center_x, center_y = bounding_box.rectangle_center
+            cropped_bounding_box = bounding_box_tuple(
+                area = bounding_box.area,
+                length_parallel = bounding_box.length_parallel,
+                length_orthogonal = bounding_box.length_orthogonal,
+                rectangle_center = (center_x - min_x, center_y - min_y),
+                unit_vector = bounding_box.unit_vector,
+                unit_vector_angle = bounding_box.unit_vector_angle,
+                corner_points = set(rot_points))
+
+            # rotate the crop by the smaller angle of the rectangle and compute
+            # where the corners of the rectangle end up
+            rotation_angle_in_rad = get_smaller_angle(cropped_bounding_box)
+            img2 = region_initial.rotate(degrees(rotation_angle_in_rad), resample = Image.BICUBIC)
+            rotated = rotated_points(cropped_bounding_box, get_center(region_initial))
+            x_dash, y_dash = rotated[0::2], rotated[1::2]
+
+            # second crop: bounding box of the rotated corners
+            min_x = int(min(x_dash))
+            min_y = int(min(y_dash))
+            max_x = int(max(x_dash))
+            max_y = int(max(y_dash))
+            box = (min_x, min_y, max_x, max_y)
+            region_final = img2.crop(box)
+            csv_out_writer.writerow(row)
+            image_out_file = os.path.join(args.out_dir, 'truth_line_image', row[1])
+            region_final.save(image_out_file)
diff --git a/egs/yomdle_zh/v1/local/extract_features.sh b/egs/yomdle_zh/v1/local/extract_features.sh
new file mode 100755
index 00000000000..7d6806a2712
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/extract_features.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+# Copyright   2017 Yiwen Shao
+#             2018 Ashish Arora
+
+nj=4  # number of parallel jobs, and of pieces images.scp is split into
+cmd=run.pl
+feat_dim=40
+fliplr=false
+augment=false
+num_channels=3
+echo "$0 $@"
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh || exit 1;
+
+data=$1  # data directory; images.scp and allowed_lengths.txt are read from it
+featdir=$data/data
+scp=$data/images.scp
+logdir=$data/log
+
+mkdir -p $logdir $featdir
+
+# make $featdir an absolute pathname
+featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}`
+
+split_scps=  # start empty so that a value inherited from the environment is not appended to
+for n in $(seq $nj); do
+    split_scps="$split_scps $logdir/images.$n.scp"
+done
+
+# split images.scp
+utils/split_scp.pl $scp $split_scps || exit 1;
+
+$cmd JOB=1:$nj $logdir/extract_features.JOB.log \
+  image/ocr/make_features.py $logdir/images.JOB.scp \
+    --allowed_len_file_path $data/allowed_lengths.txt \
+    --feat-dim $feat_dim --num-channels $num_channels --fliplr $fliplr --augment $augment \| \
+    copy-feats --compress=true --compression-method=7 \
+    ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp || exit 1;
+
+## aggregates the output scp's to get feats.scp
+for n in $(seq $nj); do
+  cat $featdir/images.$n.scp || exit 1;
+done > $data/feats.scp || exit 1
diff --git a/egs/yomdle_zh/v1/local/gedi2csv.py b/egs/yomdle_zh/v1/local/gedi2csv.py
new file mode 100755
index 00000000000..43a07421dd1
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/gedi2csv.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+
+"""
+GEDI2CSV
+Convert GEDI-type bounding boxes to CSV format
+
+GEDI Format Example:
+<GEDI xmlns= GEDI_version= GEDI_date=>
+    <USER name= date= dateFormat="mm/dd/yyyy hh:mm"> </USER>
+    <DL_DOCUMENT src= NrOfPages= docTag=>
+        <DL_PAGE gedi_type= src= pageID= width= height=>
+            <DL_ZONE gedi_type= id= col= row= width= height= Language= Quality= Overlay= Script= Type= Text_Content=> </DL_ZONE>
+        </DL_PAGE>
+    </DL_DOCUMENT>
+</GEDI>
+
+CSV Format Example
+ID,name,col1,row1,col2,row2,col3,row3,col4,row4,confidence,truth,pgrot,bbrot,qual,script,lang
+0,chinese_scanned_books_0001_0.png,99,41,99,14,754,14,754,41,100,凡我的邻人说是好的，有一大部分在我灵魂中却,0,0.0,0,,zh-cn
+"""
+
+import logging
+import os
+import sys
+import time
+import glob
+import csv
+import imghdr
+from PIL import Image
+import argparse
+import pdb
+import cv2
+import numpy as np
+import xml.etree.ElementTree as ET
+
+# numpy trigonometry helpers used by the function and the class below
+sin, cos = np.sin, np.cos
+pi = np.pi
+
+def Rotate2D(pts, cnt, ang=90):
+    """ Returns (M, res): M is the 2x2 matrix built from ang (radians, the default included), res = (pts-cnt).M + cnt """
+    rotation = np.array([[cos(ang),-sin(ang)],[sin(ang),cos(ang)]])
+    return rotation, np.dot(pts-cnt,rotation)+cnt
+
+def npbox2string(npar):
+    """ Returns the first 8 entries of npar[0] as the tuple c1,r1,c2,r2,c3,r3,c4,r4.
+        A first dimension other than 1 is only reported; processing continues. """
+    if np.shape(npar)[0] != 1:
+        print('Error during CSV conversion\n')
+    first_row = npar[0]
+    c1, r1, c2, r2, c3, r3, c4, r4 = (first_row[k] for k in range(8))
+
+    return c1,r1,c2,r2,c3,r3,c4,r4
+
+# cv2.minAreaRect() returns a Box2D structure which contains the following details - ( center (x,y), (width, height), angle of rotation )
+# Get the 4 corners of the rectangle using cv2.boxPoints()
+    
+class GEDI2CSV():
+    """ Writes the text zones collected from a GEDI file to a csv file. """
+
+    def __init__(self, logger, args):
+        """ Initialize the extractor.
+
+        logger: logger used for informational messages
+        args:   parsed arguments; only args.outputDir is read by this class
+        """
+        self._logger = logger
+        self._args = args
+
+    def csvfile(self, coords, polys, baseName, pgrot):
+        """ Write <outputDir>/<baseName>.csv: a header row followed by one
+            row per zone, i.e. per entry of coords and of polys.
+
+        coords:   list of [id,x,y,w,h,degrees,text,qual,script,text_type];
+                  the box (x,y,w,h) is rotated around its upper left corner
+                  by degrees (converted to radians for Rotate2D)
+        polys:    list of [id,poly_val,text,qual,script,text_type]; poly_val is
+                  a list of strings, each one a literal for a point
+                  (presumably "(x,y)" -- TODO confirm); the row holds the
+                  minimum area rectangle around the polygon
+        baseName: name of the csv file (without extension) and prefix of the
+                  image name written in each row
+        pgrot:    page rotation, copied to every row
+
+        Returns the number of rows written (0 when there is nothing to write).
+        """
+        import ast  # local import: only needed to parse the polygon points
+
+        # the trailing '' makes writePath end with a path separator
+        writePath = os.path.join(self._args.outputDir,'')
+        if not os.path.isdir(writePath):
+            os.makedirs(writePath)
+
+        header=['ID','name','col1','row1','col2','row2','col3','row3','col4','row4','confidence','truth','pgrot','bbrot','qual','script','text_type']
+        conf=100
+        if len(coords) == 0 and len(polys) == 0:
+            self._logger.info('Found %s with no text content',(baseName))
+            print('...Found %s with no text content' % (baseName))
+            # 0 instead of None, so that callers can always add up the result
+            return 0
+
+        rotlist = []
+
+        # for each group of coordinates
+        for zone_id,x,y,w,h,degrees,text,qual,script,text_type in coords:
+            contour = np.array([(x,y),(x+w,y),(x+w,y+h),(x,y+h)])
+
+            # First rotate around upper left corner based on orientationD keyword
+            _, rot = Rotate2D(contour, np.array([x,y]), degrees*pi/180)
+            # np.int0 no longer exists in NumPy 2.0; np.intp is the type it aliased
+            rot = rot.astype(np.intp)
+
+            # flatten the 4 rotated points into one row of 8 values
+            rot = np.reshape(rot,(-1,1)).T
+            c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(rot)
+
+            # Text_Content can be absent (text is None): only real text is cleaned
+            if text is not None:
+                text = text.replace(u'\ufeff','')
+
+            rotlist.append([zone_id,baseName + '_' + zone_id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,degrees,qual,script,text_type])
+
+        # polygons are replaced by their minimum area rectangle
+        for zone_id,poly_val,text,qual,script,text_type in polys:
+            # literal_eval accepts literals only, so annotation text cannot run code (eval could)
+            contour = np.asarray([ast.literal_eval(point) for point in poly_val])
+            # NOTE(review): cv2.convexHull expects int32/float32 points -- confirm this dtype
+            rect = cv2.minAreaRect(cv2.convexHull(contour))
+            box = cv2.boxPoints(rect).astype(np.intp)
+            box = np.reshape(box,(-1,1)).T
+            c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(box)
+
+            rotlist.append([zone_id,baseName + '_' + zone_id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,0.0,qual,script,text_type])
+
+        # then write out all of list to file
+        with open(writePath + baseName + ".csv", "w", encoding="utf-8") as f:
+            writer = csv.writer(f)
+            writer.writerow(header)
+            writer.writerows(rotlist)
+        return len(rotlist)
+    
+
+def main(args):
+    """ Convert the GEDI xml files found under args.inputDir to csv files.
+
+    Every file below args.inputDir whose name ends in .xml (case-insensitive) is
+    parsed; the selected text zones of all its pages are collected and written
+    by GEDI2CSV.csvfile to <args.outputDir>/<xml name without extension>.csv.
+
+    args: namespace with inputDir, outputDir, ftype, quality and log
+    Exits with status -1 when ftype or quality has an unsupported value.
+    """
+    # (the unused startTime = time.clock() was dropped: time.clock was removed in Python 3.8)
+    writePath = args.outputDir
+    if not os.path.isdir(writePath):
+        os.makedirs(writePath)
+
+    # Setup logging
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+    if args.log:
+        handler = logging.FileHandler(args.log)
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
+    # The options do not depend on the xml file, so they are checked once up front.
+    # fileTypeStr is the attribute a non-polygon zone must carry to be kept.
+    if args.ftype == 'boxed':
+        fileTypeStr = 'col'
+    elif args.ftype == 'transcribed':
+        fileTypeStr = 'Text_Content'
+    else:
+        print('Filetype must be either boxed or transcribed!')
+        logger.info('Filetype must be either boxed or transcribed!')
+        sys.exit(-1)
+
+    if args.quality == 'both':
+        qualset = {'Regular','Low-Quality'}
+    elif args.quality == 'low':
+        qualset = {'Low-Quality'}
+    elif args.quality == 'regular':
+        qualset = {'Regular'}
+    else:
+        print('Quality must be both, low or regular!')
+        logger.info('Quality must be both, low or regular!')
+        sys.exit(-1)
+
+    gtconverter = GEDI2CSV(logger, args)
+    namespaces = {"gedi" : "http://lamp.cfar.umd.edu/media/projects/GEDI/"}
+    zone_types = ('Machine_Print','Confusable_Allograph','Handwriting')
+
+    fileCnt = 0
+    line_write_ctr = 0
+
+    # Get all XML files in the directory and sub folders
+    for root, dirnames, filenames in os.walk(args.inputDir, followlinks=True):
+        for file in filenames:
+            if not file.lower().endswith('.xml'):
+                continue
+            fullName = os.path.join(root,file)
+            fileCnt += 1
+
+            # read the XML file; only the first DL_DOCUMENT element is used
+            gedi_root = ET.parse(fullName).getroot()
+            child = gedi_root.findall('gedi:DL_DOCUMENT',namespaces)[0]
+            coordinates = []
+            polygons = []
+            # stays 0 for a document without pages; otherwise the value of the last page
+            pageRot = 0
+
+            # and for each page
+            for pgs in child.iterfind('gedi:DL_PAGE',namespaces):
+                if 'GEDI_orientation' not in pgs.attrib:
+                    pageRot = 0
+                else:
+                    pageRot = int(pgs.attrib['GEDI_orientation'])
+                    logger.info(' PAGE ROTATION %s, %s' % (fullName, str(pageRot)))
+
+                # find children for each page
+                for zone in pgs.findall('gedi:DL_ZONE',namespaces):
+                    if not (zone.attrib['gedi_type']=='Text' and zone.attrib['Type'] in zone_types
+                            and zone.attrib['Quality'] in qualset):
+                        continue
+                    if zone.get('polygon'):
+                        polygons.append([zone.attrib['id'],zone.get('polygon').split(';'),
+                                         zone.get('Text_Content'),zone.get('Quality'),zone.get('Script'),zone.get('Type')])
+                    elif zone.get(fileTypeStr) is not None:
+                        coordinates.append([zone.attrib['id'],int(zone.attrib['col']),int(zone.attrib['row']),
+                                            int(zone.attrib['width']), int(zone.attrib['height']),
+                                            float(zone.get('orientationD',0.0)),
+                                            zone.get('Text_Content'),zone.get('Quality'),zone.get('Script'),zone.get('Type')])
+
+            if len(coordinates) > 0 or len(polygons) > 0:
+                line_write_ctr += gtconverter.csvfile(coordinates, polygons, os.path.splitext(file)[0], pageRot)
+            else:
+                print('...%s has no applicable content' % (os.path.splitext(fullName)[0]))
+
+    print('complete...total files %d, lines written %d' % (fileCnt, line_write_ctr))
+
+
+def parse_arguments(argv):
+    """ Builds the command-line parser and parses argv (a list without the program name) """
+    cli = argparse.ArgumentParser()
+
+    cli.add_argument('--inputDir', required=True, type=str, help='Input directory')
+    cli.add_argument('--outputDir', required=True, type=str, help='Output directory')
+    cli.add_argument('--ftype', default='transcribed', type=str, help='GEDI file type (either "boxed" or "transcribed")')
+    cli.add_argument('--quality', default='regular', type=str, help='GEDI file quality (either "both" or "low" or "regular")')
+    cli.add_argument('--log', default='./GEDI2CSV_enriched.log', type=str, help='Log directory')
+
+    return cli.parse_args(argv)
+
+if __name__ == '__main__':
+    # Run: parse the command line (without the program name) and convert
+    main(parse_arguments(sys.argv[1:]))
+
+
+
+    
+
+
diff --git a/egs/yomdle_zh/v1/local/prepare_dict.sh b/egs/yomdle_zh/v1/local/prepare_dict.sh
new file mode 100755
index 00000000000..65b2e7aa901
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/prepare_dict.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+# Copyright      2017  Hossein Hadian
+#                2017  Chun Chieh Chang
+#                2017  Ashish Arora
+
+# This script prepares the dictionary directory $dir: lexicon.txt and the phone lists derived from it.
+
+set -e
+dir=data/local/dict
+data_dir=data
+
+. ./utils/parse_options.sh || exit 1;
+
+base_dir=$(echo "$DIRECTORY" | cut -d "/" -f2)  # NOTE(review): DIRECTORY is not assigned in this script and base_dir is not used below
+
+mkdir -p $dir
+
+local/prepare_lexicon.py --data-dir $data_dir $dir  # writes $dir/lexicon.txt
+
+cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1;  # distinct symbols of the lexicon entries, with the string SIL removed
+
+echo '<sil> SIL' >> $dir/lexicon.txt  # appended after nonsilence_phones.txt was derived
+
+echo SIL > $dir/silence_phones.txt
+
+echo SIL >$dir/optional_silence.txt
+
+echo -n "" >$dir/extra_questions.txt  # created empty
diff --git a/egs/yomdle_zh/v1/local/prepare_lexicon.py b/egs/yomdle_zh/v1/local/prepare_lexicon.py
new file mode 100755
index 00000000000..3ebb52e38f4
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/prepare_lexicon.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+# Copyright  2018  Ashish Arora
+#                  Chun-Chieh Chang
+
+import argparse
+import os
+
+parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""")
+parser.add_argument('dir', type=str, help='output path')
+parser.add_argument('--data-dir', type=str, default='data', help='Path to text file')
+args = parser.parse_args()
+
+### main ###
+lex = {}
+text_path = os.path.join(args.data_dir, 'train', 'text')
+
+# Used specially for Chinese.
+# Uses the ChangJie keyboard input method to create subword units for Chinese.
+cj5_table = {}
+with open('download/cj5-cc.txt', 'r', encoding='utf-8') as f:
+    for line in f:
+        line_vect = line.strip().split()
+        # Skip blank/short rows and key codes that start with 'yyy' or 'z'.
+        if len(line_vect) >= 2 and not line_vect[0].startswith(('yyy', 'z')):
+            cj5_table[line_vect[1]] = "cj5_" + " cj5_".join(list(line_vect[0]))
+
+with open(text_path, 'r', encoding='utf-8') as f:
+    for line in f:
+        # The first field of each line is skipped; the remaining fields are the words.
+        for word in line.strip().split()[1:]:
+            # Put SIL instead of "|". Because every "|" in the beginning of the words is for initial-space of that word
+            # Characters without a ChangJie entry are kept as their own unit.
+            units = ['SIL' if char == '|' else cj5_table.get(char, char) for char in word]
+            # '#' is written as <HASH> in the pronunciation.
+            lex[word] = " ".join(units).replace('#', '<HASH>')
+
+with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp:
+    for key in sorted(lex):
+        fp.write(key + " " + lex[key] + "\n")
diff --git a/egs/yomdle_zh/v1/local/process_data.py b/egs/yomdle_zh/v1/local/process_data.py
new file mode 100755
index 00000000000..8964af8890a
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/process_data.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+
+# Copyright      2018  Ashish Arora
+#                2018  Chun Chieh Chang
+
+""" This script reads the extracted Chinese OCR (yomdle and slam) database files
+    and creates the following files in out_dir: text, utt2spk, images.scp.
+    Rows are read from <database_path>/truth_csv/*.csv (the header row is skipped).
+  Eg. local/process_data.py download/yomdle_chinese data_yomdle_chinese/yomdle
+  Eg. text file: english_phone_books_0001_1 To sum up, then, it would appear that
+      utt2spk file: english_phone_books_0001_0 english_phone_books_0001
+      images.scp file: english_phone_books_0001_0 \
+      data/download/truth_line_image/english_phone_books_0001_0.png
+"""
+
+import argparse
+import os
+import sys
+import csv
+import itertools
+import unicodedata
+
+parser = argparse.ArgumentParser(description="Creates text, utt2spk, and images.scp files")
+parser.add_argument('database_path', type=str, help='Path to data')
+parser.add_argument('out_dir', type=str, help='directory to output files')
+parser.add_argument('--head', type=int, default=-1, help='limit on number of synth data')
+args = parser.parse_args()
+
+### main ###
+print("Processing '{}' data...".format(args.out_dir))
+
+text_file = os.path.join(args.out_dir, 'text')
+utt2spk_file = os.path.join(args.out_dir, 'utt2spk')
+image_file = os.path.join(args.out_dir, 'images.scp')
+csv_dir = os.path.join(args.database_path, 'truth_csv')
+
+count = 0
+with open(text_file, 'w', encoding='utf-8') as text_fh, \
+        open(utt2spk_file, 'w', encoding='utf-8') as utt2spk_fh, \
+        open(image_file, 'w', encoding='utf-8') as image_fh:
+    for filename in sorted(os.listdir(csv_dir)):
+        if not filename.endswith('.csv') or 0 <= args.head <= count:
+            continue
+        count = count + 1
+        with open(os.path.join(csv_dir, filename), 'r', encoding='utf-8') as csv_file:
+            rows = csv.reader(csv_file)
+            next(rows, None)  # skip the header row
+            for row in rows:
+                # Per the header written by local/yomdle2csv.py: 1 = name, 11 = truth, 13 = bbrot.
+                image_id = os.path.splitext(row[1])[0]
+                image_filepath = os.path.join(args.database_path, 'truth_line_image', row[1])
+                text = unicodedata.normalize('NFC', row[11]).replace('\n', '')
+                if os.path.isfile(image_filepath) and os.stat(image_filepath).st_size != 0 and text:
+                    text_fh.write(image_id + ' ' + text + '\n')
+                    utt2spk_fh.write(image_id + ' ' + '_'.join(image_id.split('_')[:-1]) + '\n')
+                    image_fh.write(image_id + ' ' + image_filepath + ' ' + row[13] +  '\n')
diff --git a/egs/yomdle_zh/v1/local/score.sh b/egs/yomdle_zh/v1/local/score.sh
new file mode 100755
index 00000000000..f2405205f02
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/score.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+# Score with WER first, then CER; the CER script is started at its stage 2.
+steps/scoring/score_kaldi_wer.sh --max-lmwt 10 "$@"
+steps/scoring/score_kaldi_cer.sh --max-lmwt 10 --stage 2 "$@"
diff --git a/egs/yomdle_zh/v1/local/train_lm.sh b/egs/yomdle_zh/v1/local/train_lm.sh
new file mode 100755
index 00000000000..bc738f217da
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/train_lm.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Ashish Arora
+#           2017  Hossein Hadian
+# Apache 2.0
+#
+# This script trains a LM on the YOMDLE training transcriptions.
+# It is based on the example scripts distributed with PocoLM
+
+# It checks that pocolm is installed and, if it is not, prints the install command and exits.
+
+set -e
+stage=0
+dir=data/local/local_lm
+data_dir=data
+
+echo "$0 $@"  # Print the command line for logging
+. ./utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters
+# un-comment the following line, and change the numbers to some appropriate values.
+# You can find the values from output log of train_lm.py.
+# These example numbers of metaparameters is for 4-gram model (with min-counts)
+# running with train_lm.py.
+# The dev perplexity should be close to the non-bypassed model.
+#bypass_metaparam_optim_opt=
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure that make_lm_dir.py is called, and train only a 3-gram model
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+  nr=`cat $data_dir/train/text | wc -l`
+  nr_dev=$(($nr / 10 ))
+  nr_train=$(( $nr - $nr_dev ))
+
+  # use the training data as an additional data source.
+  # we can later fold the dev data into this.
+  head -n $nr_train $data_dir/train/text | cut -d " " -f 2- >  ${dir}/data/text/train.txt
+  tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (the validation data is used as ${dir}/data/text/dev.txt to work
+  # out interpolation weights.)
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cut -d " " -f 2-  < $data_dir/test/text  > ${dir}/data/real_dev_set.txt
+
+  # get the wordlist from the training text (train.txt)
+  cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
+fi
+
+order=3
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary.
+  # Note: you'd use --wordlist if you had a previously determined word-list
+  # that you wanted to use.
+  # Note: if you have more than one order, use a certain amount of words as the
+  # vocab and want to restrict max memory for 'sort',
+  echo "$0: training the unpruned LM"
+  min_counts='train=1'
+  wordlist=${dir}/data/wordlist
+
+  lm_name="`basename ${wordlist}`_${order}"
+  if [ -n "${min_counts}" ]; then
+    lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+  fi
+  unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+  train_lm.py  --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \
+               --min-counts="$min_counts" \
+               --limit-unk-history=true \
+               ${bypass_metaparam_optim_opt} \
+               ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
+fi
diff --git a/egs/yomdle_zh/v1/local/train_lm_lr.sh b/egs/yomdle_zh/v1/local/train_lm_lr.sh
new file mode 100755
index 00000000000..5bfc20acdeb
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/train_lm_lr.sh
@@ -0,0 +1,113 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Ashish Arora
+#           2017  Hossein Hadian
+# Apache 2.0
+#
+# This script trains a LM on the YOMDLE+Extra training transcriptions.
+# It is based on the example scripts distributed with PocoLM
+
+# It checks that pocolm is installed and, if it is not, prints the install command and exits.
+
+set -e
+stage=0
+dir=data/local/local_lm
+data_dir=data
+extra_lm=download/extra_lm.txt
+order=3
+
+echo "$0 $@"  # Print the command line for logging
+. ./utils/parse_options.sh || exit 1;
+
+lm_dir=${dir}/data
+
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters
+# un-comment the following line, and change the numbers to some appropriate values.
+# You can find the values from output log of train_lm.py.
+# These example numbers of metaparameters is for 4-gram model (with min-counts)
+# running with train_lm.py.
+# The dev perplexity should be close to the non-bypassed model.
+#bypass_metaparam_optim_opt=
+# Note: to use these example parameters, you may need to remove the .done files
+# to make sure that make_lm_dir.py is called, and train only a 3-gram model
+#for order in 3; do
+#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
+
+if [ $stage -le 0 ]; then
+  mkdir -p ${dir}/data
+  mkdir -p ${dir}/data/text
+
+  echo "$0: Getting the Data sources"
+
+  rm ${dir}/data/text/* 2>/dev/null || true
+
+  cat ${extra_lm} | local/bidi.py | utils/lang/bpe/prepend_words.py --encoding 'utf-8' | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > ${dir}/data/text/extra_lm.txt
+  
+  # Note: the name 'dev' is treated specially by pocolm, it automatically
+  # becomes the dev set.
+  nr=`cat $data_dir/train/text | wc -l`
+  nr_dev=$(($nr / 10 ))
+  nr_train=$(( $nr - $nr_dev ))
+
+  # use the training data as an additional data source.
+  # we can later fold the dev data into this.
+  head -n $nr_train $data_dir/train/text | cut -d " " -f 2- >  ${dir}/data/text/train.txt
+  tail -n $nr_dev $data_dir/train/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt
+
+  # for reporting perplexities, we'll use the "real" dev set.
+  # (the validation data is used as ${dir}/data/text/dev.txt to work
+  # out interpolation weights.)
+  # note, we can't put it in ${dir}/data/text/, because then pocolm would use
+  # it as one of the data sources.
+  cut -d " " -f 2-  < $data_dir/test/text  > ${dir}/data/real_dev_set.txt
+
+  # get the wordlist from the training text and the extra LM text
+  cat ${dir}/data/text/{train,extra_lm}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  #cat ${dir}/data/text/extra_fa.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
+fi
+
+if [ $stage -le 1 ]; then
+  # decide on the vocabulary.
+  # Note: you'd use --wordlist if you had a previously determined word-list
+  # that you wanted to use.
+  # Note: if you have more than one order, use a certain amount of words as the
+  # vocab and want to restrict max memory for 'sort',
+  echo "$0: training the unpruned LM"
+  min_counts='extra_lm=10 train=1'
+  wordlist=${dir}/data/wordlist
+
+  lm_name="`basename ${wordlist}`_${order}"
+  if [ -n "${min_counts}" ]; then
+    lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
+  fi
+  unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
+  train_lm.py  --wordlist=${wordlist} --num-splits=30 --warm-start-ratio=1 \
+               --min-counts="$min_counts" \
+               --limit-unk-history=true \
+               ${bypass_metaparam_optim_opt} \
+               ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
+fi
diff --git a/egs/yomdle_zh/v1/local/wer_output_filter b/egs/yomdle_zh/v1/local/wer_output_filter
new file mode 100755
index 00000000000..08d5563bca4
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/wer_output_filter
@@ -0,0 +1,151 @@
+#!/usr/bin/env perl
+# Copyright 2012-2014  Johns Hopkins University (Author: Yenda Trmal)
+# Apache 2.0
+
+use utf8;
+
+use open qw(:encoding(utf8));
+binmode STDIN, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+# Normalize every token after the utterance id: punctuation first, then Arabic-script-specific rules
+while (<>) {
+  @F = split " ";
+  print "$F[0] ";
+  foreach $s (@F[1..$#F]) {
+    # Normalize tabs, spaces, and no-break spaces
+    $s =~ s/[\x{0009}\x{0020}\x{00A0}]+/ /g;
+    # Normalize "dots"/"filled-circles" to periods
+    $s =~ s/[\x{25CF}\x{2022}\x{2219}]+/\x{002E}/g;
+    # Normalize dashes to regular hyphen
+    $s =~ s/[\x{2010}\x{2011}\x{2012}\x{2013}\x{2014}\x{2015}]+/\x{002D}/g;
+    # Normalize various parenthesis to regular parenthesis
+    $s =~ s/\x{FF09}/\x{0029}/g;
+    $s =~ s/\x{FF08}/\x{0028}/g;
+
+    # Convert various presentation forms to base form
+    $s =~ s/[\x{FED1}\x{FED3}\x{FED4}\x{FED2}]+/\x{0641}/g;
+    $s =~ s/[\x{FBB0}\x{FBB1}]+/\x{06D3}/g;
+    $s =~ s/[\x{FECD}\x{FECF}\x{FED0}\x{FECE}]+/\x{063A}/g;
+    $s =~ s/[\x{FBDD}]+/\x{0677}/g;
+    $s =~ s/[\x{FBA6}\x{FBA8}\x{FBA9}\x{FBA7}]+/\x{06C1}/g;
+    $s =~ s/[\x{FEC1}\x{FEC3}\x{FEC4}\x{FEC2}]+/\x{0637}/g;
+    $s =~ s/[\x{FE85}\x{FE86}]+/\x{0624}/g;
+    $s =~ s/[\x{FEA5}\x{FEA7}\x{FEA8}\x{FEA6}]+/\x{062E}/g;
+    $s =~ s/[\x{FBD9}\x{FBDA}]+/\x{06C6}/g;
+    $s =~ s/[\x{FE8F}\x{FE91}\x{FE92}\x{FE90}]+/\x{0628}/g;
+    $s =~ s/[\x{FEED}\x{FEEE}]+/\x{0648}/g;
+    $s =~ s/[\x{FE99}\x{FE9B}\x{FE9C}\x{FE9A}]+/\x{062B}/g;
+    $s =~ s/[\x{FEBD}\x{FEBF}\x{FEC0}\x{FEBE}]+/\x{0636}/g;
+    $s =~ s/[\x{FEE5}\x{FEE7}\x{FEE8}\x{FEE6}]+/\x{0646}/g;
+    $s =~ s/[\x{FBFC}\x{FBFE}\x{FBFF}\x{FBFD}]+/\x{06CC}/g;
+    $s =~ s/[\x{FBA4}\x{FBA5}]+/\x{06C0}/g;
+    $s =~ s/[\x{FB72}\x{FB74}\x{FB75}\x{FB73}]+/\x{0684}/g;
+    $s =~ s/[\x{FBD3}\x{FBD5}\x{FBD6}\x{FBD4}]+/\x{06AD}/g;
+    $s =~ s/[\x{FB6A}\x{FB6C}\x{FB6D}\x{FB6B}]+/\x{06A4}/g;
+    $s =~ s/[\x{FB66}\x{FB68}\x{FB69}\x{FB67}]+/\x{0679}/g;
+    $s =~ s/[\x{FB5E}\x{FB60}\x{FB61}\x{FB5F}]+/\x{067A}/g;
+    $s =~ s/[\x{FB88}\x{FB89}]+/\x{0688}/g;
+    $s =~ s/[\x{FB7E}\x{FB80}\x{FB81}\x{FB7F}]+/\x{0687}/g;
+    $s =~ s/[\x{FB8E}\x{FB90}\x{FB91}\x{FB8F}]+/\x{06A9}/g;
+    $s =~ s/[\x{FB86}\x{FB87}]+/\x{068E}/g;
+    $s =~ s/[\x{FE83}\x{FE84}]+/\x{0623}/g;
+    $s =~ s/[\x{FB8A}\x{FB8B}]+/\x{0698}/g;
+    $s =~ s/[\x{FED5}\x{FED7}\x{FED8}\x{FED6}]+/\x{0642}/g;
+    $s =~ s/[\x{FED9}\x{FEDB}\x{FEDC}\x{FEDA}]+/\x{0643}/g;
+    $s =~ s/[\x{FBE0}\x{FBE1}]+/\x{06C5}/g;
+    $s =~ s/[\x{FEB9}\x{FEBB}\x{FEBC}\x{FEBA}]+/\x{0635}/g;
+    $s =~ s/[\x{FEC5}\x{FEC7}\x{FEC8}\x{FEC6}]+/\x{0638}/g;
+    $s =~ s/[\x{FE8D}\x{FE8E}]+/\x{0627}/g;
+    $s =~ s/[\x{FB9A}\x{FB9C}\x{FB9D}\x{FB9B}]+/\x{06B1}/g;
+    $s =~ s/[\x{FEAD}\x{FEAE}]+/\x{0631}/g;
+    $s =~ s/[\x{FEF1}\x{FEF3}\x{FEF4}\x{FEF2}]+/\x{064A}/g;
+    $s =~ s/[\x{FE93}\x{FE94}]+/\x{0629}/g;
+    $s =~ s/[\x{FBE4}\x{FBE6}\x{FBE7}\x{FBE5}]+/\x{06D0}/g;
+    $s =~ s/[\x{FE89}\x{FE8B}\x{FE8C}\x{FE8A}]+/\x{0626}/g;
+    $s =~ s/[\x{FB84}\x{FB85}]+/\x{068C}/g;
+    $s =~ s/[\x{FE9D}\x{FE9F}\x{FEA0}\x{FE9E}]+/\x{062C}/g;
+    $s =~ s/[\x{FB82}\x{FB83}]+/\x{068D}/g;
+    $s =~ s/[\x{FEA1}\x{FEA3}\x{FEA4}\x{FEA2}]+/\x{062D}/g;
+    $s =~ s/[\x{FB52}\x{FB54}\x{FB55}\x{FB53}]+/\x{067B}/g;
+    $s =~ s/[\x{FB92}\x{FB94}\x{FB95}\x{FB93}]+/\x{06AF}/g;
+    $s =~ s/[\x{FB7A}\x{FB7C}\x{FB7D}\x{FB7B}]+/\x{0686}/g;
+    $s =~ s/[\x{FBDB}\x{FBDC}]+/\x{06C8}/g;
+    $s =~ s/[\x{FB56}\x{FB58}\x{FB59}\x{FB57}]+/\x{067E}/g;
+    $s =~ s/[\x{FEB5}\x{FEB7}\x{FEB8}\x{FEB6}]+/\x{0634}/g;
+    $s =~ s/[\x{FBE2}\x{FBE3}]+/\x{06C9}/g;
+    $s =~ s/[\x{FB96}\x{FB98}\x{FB99}\x{FB97}]+/\x{06B3}/g;
+    $s =~ s/[\x{FE80}]+/\x{0621}/g;
+    $s =~ s/[\x{FBAE}\x{FBAF}]+/\x{06D2}/g;
+    $s =~ s/[\x{FB62}\x{FB64}\x{FB65}\x{FB63}]+/\x{067F}/g;
+    $s =~ s/[\x{FEE9}\x{FEEB}\x{FEEC}\x{FEEA}]+/\x{0647}/g;
+    $s =~ s/[\x{FE81}\x{FE82}]+/\x{0622}/g;
+    $s =~ s/[\x{FBDE}\x{FBDF}]+/\x{06CB}/g;
+    $s =~ s/[\x{FE87}\x{FE88}]+/\x{0625}/g;
+    $s =~ s/[\x{FB6E}\x{FB70}\x{FB71}\x{FB6F}]+/\x{06A6}/g;
+    $s =~ s/[\x{FBA0}\x{FBA2}\x{FBA3}\x{FBA1}]+/\x{06BB}/g;
+    $s =~ s/[\x{FBAA}\x{FBAC}\x{FBAD}\x{FBAB}]+/\x{06BE}/g;
+    $s =~ s/[\x{FEA9}\x{FEAA}]+/\x{062F}/g;
+    $s =~ s/[\x{FEE1}\x{FEE3}\x{FEE4}\x{FEE2}]+/\x{0645}/g;
+    $s =~ s/[\x{FEEF}\x{FBE8}\x{FBE9}\x{FEF0}]+/\x{0649}/g;
+    $s =~ s/[\x{FB8C}\x{FB8D}]+/\x{0691}/g;
+    $s =~ s/[\x{FB76}\x{FB78}\x{FB79}\x{FB77}]+/\x{0683}/g;
+    $s =~ s/[\x{FB5A}\x{FB5C}\x{FB5D}\x{FB5B}]+/\x{0680}/g;
+    $s =~ s/[\x{FB9E}\x{FB9F}]+/\x{06BA}/g;
+    $s =~ s/[\x{FEC9}\x{FECB}\x{FECC}\x{FECA}]+/\x{0639}/g;
+    $s =~ s/[\x{FEDD}\x{FEDF}\x{FEE0}\x{FEDE}]+/\x{0644}/g;
+    $s =~ s/[\x{FB50}\x{FB51}]+/\x{0671}/g;
+    $s =~ s/[\x{FEB1}\x{FEB3}\x{FEB4}\x{FEB2}]+/\x{0633}/g;
+    $s =~ s/[\x{FE95}\x{FE97}\x{FE98}\x{FE96}]+/\x{062A}/g;
+    $s =~ s/[\x{FBD7}\x{FBD8}]+/\x{06C7}/g;
+    $s =~ s/[\x{FEAF}\x{FEB0}]+/\x{0632}/g;
+    $s =~ s/[\x{FEAB}\x{FEAC}]+/\x{0630}/g;
+
+    # Remove tatweel
+    $s =~ s/\x{0640}//g;
+    # Remove vowels and hamza
+    $s =~ s/[\x{064B}-\x{0655}]+//g;
+    # Remove right-to-left and left-to-right
+    $s =~ s/[\x{200F}\x{200E}]+//g;
+    # Arabic Keheh to Arabic Kaf
+    $s =~ s/\x{06A9}/\x{0643}/g;
+    # Arabic Yeh to Farsi Yeh
+    $s =~ s/\x{064A}/\x{06CC}/g;
+    # Decompose RIAL
+    $s =~ s/\x{FDFC}/\x{0631}\x{06CC}\x{0627}\x{0644}/g;
+    # Farsi arabic-indic digits to arabic-indic digits
+    $s =~ s/\x{06F0}/\x{0660}/g;
+    $s =~ s/\x{06F1}/\x{0661}/g;
+    $s =~ s/\x{06F2}/\x{0662}/g;
+    $s =~ s/\x{06F3}/\x{0663}/g;
+    $s =~ s/\x{06F4}/\x{0664}/g;
+    $s =~ s/\x{06F5}/\x{0665}/g;
+    $s =~ s/\x{06F6}/\x{0666}/g;
+    $s =~ s/\x{06F7}/\x{0667}/g;
+    $s =~ s/\x{06F8}/\x{0668}/g;
+    $s =~ s/\x{06F9}/\x{0669}/g;
+    # Arabic-indic digits to digits
+    $s =~ s/\x{0660}/0/g;
+    $s =~ s/\x{0661}/1/g;
+    $s =~ s/\x{0662}/2/g;
+    $s =~ s/\x{0663}/3/g;
+    $s =~ s/\x{0664}/4/g;
+    $s =~ s/\x{0665}/5/g;
+    $s =~ s/\x{0666}/6/g;
+    $s =~ s/\x{0667}/7/g;
+    $s =~ s/\x{0668}/8/g;
+    $s =~ s/\x{0669}/9/g;
+    # Arabic comma to comma
+    $s =~ s/\x{060C}/\x{002C}/g;
+    # Every "|" becomes a space
+    $s =~ s/\|/ /g;
+    # Tokens are printed back to back: the only separators in the output are
+    # the spaces substituted for "|" above. (The former if/else printed $s in
+    # one branch and "" in the other, which is the same as printing $s.)
+    print $s;
+
+  }
+  print "\n";
+}
+
diff --git a/egs/yomdle_zh/v1/local/yomdle2csv.py b/egs/yomdle_zh/v1/local/yomdle2csv.py
new file mode 100755
index 00000000000..3641de90324
--- /dev/null
+++ b/egs/yomdle_zh/v1/local/yomdle2csv.py
@@ -0,0 +1,227 @@
+#!/usr/bin/env python3
+
+"""
+GEDI2CSV
+Convert GEDI-type bounding boxes to CSV format
+
+GEDI Format Example:
+<GEDI xmlns= GEDI_version= GEDI_date=>
+    <USER name= date= dateFormat="mm/dd/yyyy hh:mm"> </USER>
+    <DL_DOCUMENT src= NrOfPages= docTag=>
+        <DL_PAGE gedi_type= src= pageID= width= height=>
+            <DL_ZONE gedi_type= id=  Illegible= polygon=  Language= Text_Content= text_raw=> </DL_ZONE>
+        </DL_PAGE>
+    </DL_DOCUMENT>
+</GEDI>
+
+CSV Format Example
+ID,name,col1,row1,col2,row2,col3,row3,col4,row4,confidence,truth,pgrot,bbrot,qual,script,lang
+0,chinese_scanned_books_0001_0.png,99,41,99,14,754,14,754,41,100,凡我的邻人说是好的，有一大部分在我灵魂中却,0,0.0,0,,zh-cn
+"""
+
+import logging
+import os
+import sys
+import time
+import glob
+import csv
+import imghdr
+from PIL import Image
+import argparse
+import pdb
+import cv2
+import numpy as np
+import xml.etree.ElementTree as ET
+
+sin = np.sin
+cos = np.cos
+pi = np.pi
+
+def Rotate2D(pts, cnt, ang=90):
+    c, s = cos(ang), sin(ang)  # ang goes straight into np.cos/np.sin, i.e. it is read as radians
+    M = np.array([[c, -s], [s, c]])
+    return M, np.dot(pts - cnt, M) + cnt
+
+def npbox2string(npar):
+    # Unpacks the first row of npar into c1,r1,...,c4,r4; any other row count is only reported.
+    if np.shape(npar)[0] != 1:
+        print('Error during CSV conversion\n')
+    first_row = npar[0]
+    c1, r1, c2, r2 = first_row[0], first_row[1], first_row[2], first_row[3]
+    c3, r3, c4, r4 = first_row[4], first_row[5], first_row[6], first_row[7]
+
+    return c1, r1, c2, r2, c3, r3, c4, r4
+
+# cv2.minAreaRect() returns a Box2D structure which contains the following details - ( center (x,y), (width, height), angle of rotation )
+# Get 4 corners of the rectangle using cv2.boxPoints()
+
+class GEDI2CSV():
+    """Writes the polygon zones of one GEDI file as rows of a ground-truth CSV file."""
+
+    def __init__(self, logger, args):
+        """Keep the logger and the parsed options (csvfile reads args.outputDir)."""
+        self._logger = logger
+        self._args = args
+
+    def csvfile(self, coords, polys, baseName, pgrot):
+        """Write <outputDir>/<baseName>.csv with one row per usable polygon.
+
+        coords   -- only tested for emptiness together with polys
+        polys    -- list of [id, polygon point strings, text, illegible flag, language]
+        baseName -- stem of the CSV file name and prefix of every row's image name
+        pgrot    -- accepted but not used: the pgrot column is always written as 0
+
+        Polygons that cannot be processed are reported on stdout and skipped.
+        Returns the number of data rows written (0 when there is nothing to write).
+        """
+        writePath = self._args.outputDir
+        os.makedirs(writePath, exist_ok=True)
+
+        # Column layout of the output file.
+        header=['ID','name','col1','row1','col2','row2','col3','row3','col4','row4','confidence','truth','pgrot','bbrot','qual','script','lang']
+        conf = 100
+        pgrot = 0  # NOTE(review): the caller's page rotation is discarded, as before
+        bbrot = 0.0
+        script = None  # csv.writer writes None as an empty field
+
+        if len(coords) == 0 and len(polys) == 0:
+            self._logger.info('Found %s with no text content',(baseName))
+            print('...Found %s with no text content' % (baseName))
+            return 0
+
+        # Rows are collected first and written in one go once all polygons are done.
+        rotlist = []
+        for j in polys:
+            try:
+                [zone_id,poly_val,text,qual,lang] = j
+                # WARNING: eval() executes whatever the XML polygon attribute contains;
+                # run this only on trusted GEDI files.
+                arr = [eval(i) for i in poly_val if len(i.strip()) > 0]
+
+                # Minimum-area rectangle around the convex hull of the polygon points.
+                contour = np.asarray(arr)
+                convex = cv2.convexHull(contour)
+                rect = cv2.minAreaRect(convex)
+                box = cv2.boxPoints(rect)
+                # np.intp is the type the np.int0 alias (removed in NumPy 2.0) stood for.
+                box = np.intp(box)
+                box = np.reshape(box,(-1,1)).T
+                c1,r1,c2,r2,c3,r3,c4,r4 = npbox2string(box)
+
+                rotlist.append([zone_id,baseName + '_' + zone_id + '.png',c1,r1,c2,r2,c3,r3,c4,r4,conf,text,pgrot,bbrot,qual,script,lang])
+
+            except Exception:
+                # KeyboardInterrupt/SystemExit are no longer swallowed by this handler.
+                print('...polygon error %s, %s' % (j, baseName))
+                continue
+
+        # os.path.join keeps the file inside outputDir even without a trailing slash.
+        csv_path = os.path.join(writePath, baseName + ".csv")
+        write_ctr = 0
+        with open(csv_path, "w", encoding="utf-8") as f:
+            writer = csv.writer(f)
+            writer.writerow(header)
+            for row in rotlist:
+                writer.writerow(row)
+                write_ctr += 1
+
+        return write_ctr
+
+
+def main(args):
+    """Convert every GEDI XML file under args.inputDir into a CSV file in args.outputDir.
+
+    args is the namespace from parse_arguments (inputDir, outputDir, log). Progress
+    and a final summary are printed; page rotations are written to the log file.
+    """
+    # The time.clock() call that used to be here is gone: the function was removed
+    # in Python 3.8 (AttributeError at start-up) and its result was never used.
+    writePath = args.outputDir
+    print('write to %s' % (writePath))
+    os.makedirs(writePath, exist_ok=True)
+
+    # Setup logging
+    logger = logging.getLogger(__name__)
+    logger.setLevel(logging.INFO)
+    if args.log:
+        handler = logging.FileHandler(args.log)
+        handler.setLevel(logging.INFO)
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
+    gtconverter = GEDI2CSV(logger, args)
+    namespaces = {"gedi" : "http://lamp.cfar.umd.edu/media/projects/GEDI/"}
+
+    fileCnt = 0
+    line_write_ctr = 0
+    line_error_ctr = 0  # printed in the summary; nothing in this function increments it
+    file_error_ctr = 0
+
+    # Get all XML files in the directory and sub folders
+    print('reading %s' % (args.inputDir))
+    for root, dirnames, filenames in os.walk(args.inputDir, followlinks=True):
+        for file in filenames:
+            if not file.lower().endswith('.xml'):
+                continue
+            fullName = os.path.join(root,file)
+            fileCnt += 1
+
+            try:
+                tree = ET.parse(fullName)
+            except Exception:
+                print('...ERROR parsing %s' % (fullName))
+                file_error_ctr += 1
+                continue
+
+            documents = tree.getroot().findall('gedi:DL_DOCUMENT',namespaces)
+            if not documents:
+                # No DL_DOCUMENT element: count the file as an error instead of crashing.
+                print('...ERROR parsing %s' % (fullName))
+                file_error_ctr += 1
+                continue
+
+            polygons = []
+            pageRot = 0
+            for pgs in documents[0].iterfind('gedi:DL_PAGE',namespaces):
+                # pageRot keeps the value of the last page that was visited.
+                if 'GEDI_orientation' not in pgs.attrib:
+                    pageRot=0
+                else:
+                    pageRot = int(pgs.attrib['GEDI_orientation'])
+                    logger.info(' PAGE ROTATION %s, %s' % (fullName, str(pageRot)))
+
+                # Collect the polygon of every Text zone; zones without gedi_type are skipped.
+                for zone in pgs.findall('gedi:DL_ZONE',namespaces):
+                    if zone.get('gedi_type') != 'Text':
+                        continue
+                    if zone.get('polygon'):
+                        polygons.append([zone.attrib['id'],zone.get('polygon').split(';'),
+                                         zone.get('Text_Content'),zone.get('Illegible'),zone.get('Language')])
+                    else:
+                        print('...Not polygon')
+
+            if polygons:
+                # No box coordinates are collected here, so coords is passed empty.
+                line_write_ctr += gtconverter.csvfile([], polygons, os.path.splitext(file)[0], pageRot)
+            else:
+                print('...%s has no text content' % (os.path.splitext(fullName)[0]))
+
+
+    print('complete...total files %d, lines written %d, img errors %d, line error %d' % (fileCnt, line_write_ctr, file_error_ctr, line_error_ctr))
+
+
+def parse_arguments(argv):
+    """Build the option parser and parse argv; every option is a string with a default."""
+    option_table = (('--inputDir', 'Input directory', '/data/YOMDLE/final_arabic/xml'),
+                    ('--outputDir', 'Output directory', '/exp/YOMDLE/final_arabic/csv_truth/'),
+                    ('--log', 'Log directory', '/exp/logs.txt'))
+    cli = argparse.ArgumentParser()
+    for flag, help_text, default_value in option_table:
+        cli.add_argument(flag, type=str, help=help_text, default=default_value)
+    return cli.parse_args(argv)
+
+
+if __name__ == '__main__':
+    # Script entry point: parse the command line and hand the options to main().
+    main(parse_arguments(sys.argv[1:]))
diff --git a/egs/yomdle_zh/v1/path.sh b/egs/yomdle_zh/v1/path.sh
new file mode 100644
index 00000000000..2d17b17a84a
--- /dev/null
+++ b/egs/yomdle_zh/v1/path.sh
@@ -0,0 +1,6 @@
+export KALDI_ROOT=`pwd`/../../..
+[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
+export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
+[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
+. $KALDI_ROOT/tools/config/common_path.sh
+export LC_ALL=C
diff --git a/egs/yomdle_zh/v1/run.sh b/egs/yomdle_zh/v1/run.sh
new file mode 100755
index 00000000000..7e6aab56806
--- /dev/null
+++ b/egs/yomdle_zh/v1/run.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+set -e
+stage=0
+nj=60
+
+database_slam=/export/corpora5/slam/SLAM/Chinese/transcribed
+database_yomdle=/export/corpora5/slam/YOMDLE/final_chinese
+download_dir=data_yomdle_chinese/download/
+extra_lm=download/extra_lm.txt
+data_dir=data_yomdle_chinese
+exp_dir=exp_yomdle_chinese
+
+. ./cmd.sh
+. ./path.sh
+. ./utils/parse_options.sh
+
+if [ $stage -le -1 ]; then
+    local/create_download.sh --database-slam $database_slam \
+        --database-yomdle $database_yomdle \
+        --slam-dir download/slam_chinese \
+        --yomdle-dir download/yomdle_chinese
+fi
+
+if [ $stage -le 0 ]; then
+    mkdir -p data_slam_chinese/slam
+    mkdir -p data_yomdle_chinese/yomdle
+    local/process_data.py download/slam_chinese data_slam_chinese/slam
+    local/process_data.py download/yomdle_chinese data_yomdle_chinese/yomdle
+    ln -sfn ../data_slam_chinese/slam ${data_dir}/test  # -fn: replace the link itself when the stage is re-run
+    ln -sfn ../data_yomdle_chinese/yomdle ${data_dir}/train  # (plain -s would create a stray link inside the target)
+    image/fix_data_dir.sh ${data_dir}/test
+    image/fix_data_dir.sh ${data_dir}/train
+fi
+
+mkdir -p $data_dir/{train,test}/data
+if [ $stage -le 1 ]; then
+    echo "$0: Obtaining image groups. calling get_image2num_frames"
+    echo "Date: $(date)."
+    image/get_image2num_frames.py --feat-dim 60 $data_dir/train
+    image/get_allowed_lengths.py --frame-subsampling-factor 4 10 $data_dir/train
+
+    for datasplit in train test; do
+        echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $datasplit. "
+        echo "Date: $(date)."
+        local/extract_features.sh --nj $nj --cmd "$cmd" \
+            --feat-dim 60 --num-channels 3 \
+            $data_dir/${datasplit}
+        steps/compute_cmvn_stats.sh $data_dir/${datasplit} || exit 1;
+    done
+
+    echo "$0: Fixing data directory for train dataset"
+    echo "Date: $(date)."
+    utils/fix_data_dir.sh $data_dir/train
+fi
+
+if [ $stage -le 2 ]; then
+    for datasplit in train; do
+        echo "$(date) stage 2: Performing augmentation, it will double training data"
+        local/augment_data.sh --nj $nj --cmd "$cmd" --feat-dim 60 $data_dir/${datasplit} $data_dir/${datasplit}_aug $data_dir
+        steps/compute_cmvn_stats.sh $data_dir/${datasplit}_aug || exit 1;
+    done
+fi
+
+if [ $stage -le 3 ]; then
+    echo "$0: Preparing dictionary and lang..."
+    if [ ! -f $data_dir/train/bpe.out ]; then
+        cut -d' ' -f2- $data_dir/train/text | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/learn_bpe.py -s 700 > $data_dir/train/bpe.out
+        for datasplit in test train train_aug; do
+            cut -d' ' -f1 $data_dir/$datasplit/text > $data_dir/$datasplit/ids
+            cut -d' ' -f2- $data_dir/$datasplit/text | utils/lang/bpe/prepend_words.py | python3 utils/lang/bpe/apply_bpe.py -c $data_dir/train/bpe.out | sed 's/@@//g' > $data_dir/$datasplit/bpe_text
+            mv $data_dir/$datasplit/text $data_dir/$datasplit/text.old
+            paste -d' ' $data_dir/$datasplit/ids $data_dir/$datasplit/bpe_text > $data_dir/$datasplit/text
+        done
+    fi
+
+    local/prepare_dict.sh --data-dir $data_dir --dir $data_dir/local/dict
+    # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations.
+    # So we set --sil-prob to 0.0
+    utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
+        $data_dir/local/dict "<sil>" $data_dir/lang/temp $data_dir/lang
+    utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 $data_dir/lang
+fi
+
+if [ $stage -le 4 ]; then
+    echo "$0: Estimating a language model for decoding..."
+    local/train_lm.sh --data-dir $data_dir  --dir $data_dir/local/local_lm
+    utils/format_lm.sh $data_dir/lang $data_dir/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \
+        $data_dir/local/dict/lexicon.txt $data_dir/lang_test
+fi
+
+if [ $stage -le 5 ]; then
+    echo "$0: Calling the flat-start chain recipe..."
+    echo "Date: $(date)." 
+    local/chain/run_flatstart_cnn1a.sh --nj $nj --train-set train_aug --data-dir $data_dir --exp-dir $exp_dir
+fi
+
+if [ $stage -le 6 ]; then
+    echo "$0: Aligning the training data using the e2e chain model..."
+    echo "Date: $(date)."
+    steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
+        --scale-opts '--transition-scale=1.0 --acoustic-scale=1.0 --self-loop-scale=1.0' \
+        $data_dir/train_aug $data_dir/lang $exp_dir/chain/e2e_cnn_1a $exp_dir/chain/e2e_ali_train
+fi
+
+if [ $stage -le 7 ]; then
+    echo "$0: Building a tree and training a regular chain model using the e2e alignments..."
+    echo "Date: $(date)."
+    local/chain/run_cnn_e2eali_1b.sh --nj $nj --train-set train_aug --data-dir $data_dir --exp-dir $exp_dir
+fi
+
+if [ $stage -le 8 ]; then
+    echo "$0: Estimating a language model for lattice rescoring...$(date)"
+    local/train_lm_lr.sh --data-dir $data_dir  --dir $data_dir/local/local_lm_lr --extra-lm $extra_lm --order 6
+
+    utils/build_const_arpa_lm.sh $data_dir/local/local_lm_lr/data/arpa/6gram_unpruned.arpa.gz \
+        $data_dir/lang_test $data_dir/lang_test_lr
+    steps/lmrescore_const_arpa.sh $data_dir/lang_test $data_dir/lang_test_lr \
+        $data_dir/test $exp_dir/chain/cnn_e2eali_1b/decode_test $exp_dir/chain/cnn_e2eali_1b/decode_test_lr
+fi
diff --git a/egs/yomdle_zh/v1/steps b/egs/yomdle_zh/v1/steps
new file mode 120000
index 00000000000..1b186770dd1
--- /dev/null
+++ b/egs/yomdle_zh/v1/steps
@@ -0,0 +1 @@
+../../wsj/s5/steps/
\ No newline at end of file
diff --git a/egs/yomdle_zh/v1/utils b/egs/yomdle_zh/v1/utils
new file mode 120000
index 00000000000..a3279dc8679
--- /dev/null
+++ b/egs/yomdle_zh/v1/utils
@@ -0,0 +1 @@
+../../wsj/s5/utils/
\ No newline at end of file