diff --git a/egs/madcat_ar/v1/local/chain/run_cnn.sh b/egs/madcat_ar/v1/local/chain/run_cnn.sh
new file mode 120000
index 00000000000..df6f0a468c1
--- /dev/null
+++ b/egs/madcat_ar/v1/local/chain/run_cnn.sh
@@ -0,0 +1 @@
+tuning/run_cnn_1a.sh
\ No newline at end of file
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh b/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh
new file mode 120000
index 00000000000..a864819f542
--- /dev/null
+++ b/egs/madcat_ar/v1/local/chain/run_cnn_chainali.sh
@@ -0,0 +1 @@
+tuning/run_cnn_chainali_1a.sh
\ No newline at end of file
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh
new file mode 120000
index 00000000000..fcf59f917c1
--- /dev/null
+++ b/egs/madcat_ar/v1/local/chain/run_cnn_e2eali.sh
@@ -0,0 +1 @@
+tuning/run_cnn_e2eali_1b.sh
\ No newline at end of file
diff --git a/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh b/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh
new file mode 120000
index 00000000000..d26ba0182ce
--- /dev/null
+++ b/egs/madcat_ar/v1/local/chain/run_e2e_cnn.sh
@@ -0,0 +1 @@
+tuning/run_e2e_cnn_1a.sh
\ No newline at end of file
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
similarity index 100%
rename from egs/madcat_ar/v1/local/chain/run_cnn_1a.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_1a.sh
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
similarity index 100%
rename from egs/madcat_ar/v1/local/chain/run_cnn_chainali_1a.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_chainali_1a.sh
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
similarity index 100%
rename from egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1a.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1a.sh
diff --git a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
similarity index 99%
rename from egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
index 75c246f5ffe..55df0cad4b7 100755
--- a/egs/madcat_ar/v1/local/chain/run_cnn_e2eali_1b.sh
+++ b/egs/madcat_ar/v1/local/chain/tuning/run_cnn_e2eali_1b.sh
@@ -193,7 +193,7 @@ if [ $stage -le 5 ]; then
     --trainer.srand=$srand \
     --trainer.max-param-change=2.0 \
     --trainer.num-epochs=4 \
-    --trainer.frames-per-iter=1000000 \
+    --trainer.frames-per-iter=2000000 \
     --trainer.optimization.num-jobs-initial=3 \
     --trainer.optimization.num-jobs-final=16 \
     --trainer.optimization.initial-effective-lrate=0.001 \
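The four run_*.sh entry points above are relative symlinks into the new tuning/ directory, the convention used across Kaldi recipes: the default system is switched by repointing a link rather than editing scripts. A sketch of how such links are (re)created, using the targets from this patch; the ln commands themselves are illustrative, not part of the patch:

    # run from egs/madcat_ar/v1/local/chain/
    ln -sf tuning/run_cnn_1a.sh          run_cnn.sh
    ln -sf tuning/run_cnn_chainali_1a.sh run_cnn_chainali.sh
    ln -sf tuning/run_cnn_e2eali_1b.sh   run_cnn_e2eali.sh
    ln -sf tuning/run_e2e_cnn_1a.sh      run_e2e_cnn.sh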
diff --git a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh
similarity index 93%
rename from egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh
rename to egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh
index 2c85e982ce6..033cb88df10 100755
--- a/egs/madcat_ar/v1/local/chain/run_flatstart_cnn1a.sh
+++ b/egs/madcat_ar/v1/local/chain/tuning/run_e2e_cnn_1a.sh
@@ -5,16 +5,16 @@
 # local/chain/compare_wer.sh exp/chain/e2e_cnn_1a
 # System                      e2e_cnn_1a
-# WER                             10.71
-# CER                              2.85
-# Final train prob              -0.0859
-# Final valid prob              -0.1266
+# WER                              7.81
+# CER                              2.05
+# Final train prob              -0.0812
+# Final valid prob              -0.0708
 # Final train prob (xent)
 # Final valid prob (xent)
 # Parameters                      2.94M
 
 # steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/
-# exp/chain/e2e_cnn_1a/: num-iters=195 nj=6..16 num-params=2.9M dim=40->324 combine=-0.065->-0.064 (over 5) logprob:train/valid[129,194,final]=(-0.078,-0.077,-0.086/-0.129,-0.126,-0.127)
+# exp/chain/e2e_cnn_1a/: num-iters=98 nj=6..16 num-params=2.9M dim=40->330 combine=-0.073->-0.073 (over 2) logprob:train/valid[64,97,final]=(-0.084,-0.080,-0.081/-0.073,-0.070,-0.071)
 
 set -e
@@ -33,7 +33,7 @@ num_jobs_final=16
 minibatch_size=150=128,64/300=128,64/600=64,32/1200=32,16
 common_egs_dir=
 l2_regularize=0.00005
-frames_per_iter=1000000
+frames_per_iter=2000000
 cmvn_opts="--norm-means=true --norm-vars=true"
 train_set=train
 lang_test=lang_test
@@ -125,6 +125,7 @@ if [ $stage -le 3 ]; then
     --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \
     --chain.frame-subsampling-factor 4 \
     --chain.alignment-subsampling-factor 4 \
+    --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \
     --trainer.add-option="--optimization.memory-compression-level=2" \
     --trainer.num-chunk-per-minibatch $minibatch_size \
     --trainer.frames-per-iter $frames_per_iter \
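Doubling --trainer.frames-per-iter from 1000000 to 2000000 means each iteration consumes twice as many frames, so with the same data, number of epochs, and job schedule the iteration count roughly halves; this is consistent with num-iters dropping from 195 to 98 in the chain_dir_info.pl lines above. The iteration count of any trained directory can be checked with the same tool:

    # prints num-iters, the nj range, num-params and train/valid log-probs
    steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a/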
""" - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False - - return True - + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True ### main ### - def main(): wc_dict1 = parse_writing_conditions(args.writing_condition1) @@ -564,8 +519,7 @@ def main(): madcat_file_path, image_file_path, wc_dict = check_file_location(base_name, wc_dict1, wc_dict2, wc_dict3) if wc_dict is None or not check_writing_condition(wc_dict, base_name): continue - if madcat_file_path is not None: - get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) + get_line_images_from_page_image(image_file_path, madcat_file_path, image_fh) if __name__ == '__main__': diff --git a/egs/madcat_ar/v1/local/extract_features.sh b/egs/madcat_ar/v1/local/extract_features.sh index 70c5498626c..56a8443e328 100755 --- a/egs/madcat_ar/v1/local/extract_features.sh +++ b/egs/madcat_ar/v1/local/extract_features.sh @@ -1,7 +1,11 @@ #!/bin/bash + # Copyright 2017 Yiwen Shao # 2018 Ashish Arora +# Apache 2.0 +# This script runs the make features script in parallel. + nj=4 cmd=run.pl feat_dim=40 diff --git a/egs/madcat_ar/v1/local/process_data.py b/egs/madcat_ar/v1/local/process_data.py index b57500cf2fa..920cb6f700b 100755 --- a/egs/madcat_ar/v1/local/process_data.py +++ b/egs/madcat_ar/v1/local/process_data.py @@ -42,6 +42,8 @@ help='Path to the downloaded (and extracted) writing conditions file 2') parser.add_argument('writing_condition3', type=str, help='Path to the downloaded (and extracted) writing conditions file 3') +parser.add_argument("--subset", type=lambda x: (str(x).lower()=='true'), default=False, + help="only processes subset of data based on writing condition") args = parser.parse_args() @@ -97,50 +99,40 @@ def check_writing_condition(wc_dict): Returns: (bool): True if writing condition matches. """ - return True - writing_condition = wc_dict[base_name].strip() - if writing_condition != 'IUC': - return False + if args.subset: + writing_condition = wc_dict[base_name].strip() + if writing_condition != 'IUC': + return False + else: + return True - return True - -def get_word_line_mapping(madcat_file_path): +def read_text(madcat_file_path): """ Maps every word in the page image to a corresponding line. Args: - madcat_file_path (string): complete path and name of the madcat xml file + madcat_file_path (string): complete path and name of the madcat xml file corresponding to the page image. Returns: + dict: Mapping every word in the page image to a corresponding line. """ + + word_line_dict = dict() doc = minidom.parse(madcat_file_path) zone = doc.getElementsByTagName('zone') for node in zone: line_id = node.getAttribute('id') - line_word_dict[line_id] = list() word_image = node.getElementsByTagName('token-image') for tnode in word_image: word_id = tnode.getAttribute('id') - line_word_dict[line_id].append(word_id) word_line_dict[word_id] = line_id - -def read_text(madcat_file_path): - """ Maps every word in the page image to a corresponding line. - Args: - madcat_file_path (string): complete path and name of the madcat xml file - corresponding to the page image. - Returns: - dict: Mapping every word in the page image to a corresponding line. 
- """ text_line_word_dict = dict() - doc = minidom.parse(madcat_file_path) segment = doc.getElementsByTagName('segment') for node in segment: token = node.getElementsByTagName('token') for tnode in token: ref_word_id = tnode.getAttribute('ref_id') word = tnode.getElementsByTagName('source')[0].firstChild.nodeValue - word = unicodedata.normalize('NFKC',word) ref_line_id = word_line_dict[ref_word_id] if ref_line_id not in text_line_word_dict: text_line_word_dict[ref_line_id] = list() @@ -160,7 +152,6 @@ def get_line_image_location(): ### main ### - print("Processing '{}' data...".format(args.out_dir)) text_file = os.path.join(args.out_dir, 'text') @@ -188,24 +179,19 @@ def get_line_image_location(): madcat_xml_path, image_file_path, wc_dict = check_file_location() if wc_dict is None or not check_writing_condition(wc_dict): continue - if madcat_xml_path is not None: - madcat_doc = minidom.parse(madcat_xml_path) - writer = madcat_doc.getElementsByTagName('writer') - writer_id = writer[0].getAttribute('id') - line_word_dict = dict() - word_line_dict = dict() - get_word_line_mapping(madcat_xml_path) - text_line_word_dict = read_text(madcat_xml_path) - base_name = os.path.basename(image_file_path) - base_name, b = base_name.split('.tif') - for lineID in sorted(text_line_word_dict): - updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' - location = image_loc_dict[updated_base_name] - image_file_path = os.path.join(location, updated_base_name) - line = text_line_word_dict[lineID] - text = ' '.join(line) - utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) - text_fh.write(utt_id + ' ' + text + '\n') - utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') - image_fh.write(utt_id + ' ' + image_file_path + '\n') - image_num += 1 + madcat_doc = minidom.parse(madcat_xml_path) + writer = madcat_doc.getElementsByTagName('writer') + writer_id = writer[0].getAttribute('id') + text_line_word_dict = read_text(madcat_xml_path) + base_name = os.path.basename(image_file_path).split('.tif')[0] + for lineID in sorted(text_line_word_dict): + updated_base_name = base_name + '_' + str(lineID).zfill(4) +'.png' + location = image_loc_dict[updated_base_name] + image_file_path = os.path.join(location, updated_base_name) + line = text_line_word_dict[lineID] + text = ' '.join(line) + utt_id = writer_id + '_' + str(image_num).zfill(6) + '_' + base_name + '_' + str(lineID).zfill(4) + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') + image_num += 1 diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh index 2c11aba3e13..31564d25326 100755 --- a/egs/madcat_ar/v1/local/score.sh +++ b/egs/madcat_ar/v1/local/score.sh @@ -1,5 +1,5 @@ #!/bin/bash -steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" -steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@" +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh index 3b8a382cb00..b7fc0b09a46 100755 --- a/egs/madcat_ar/v1/local/train_lm.sh +++ b/egs/madcat_ar/v1/local/train_lm.sh @@ -6,20 +6,19 @@ # 2017 Hossein Hadian # Apache 2.0 # -# This script trains a LM on the MADCAT training transcriptions. +# This script trains a LM on the training transcriptions. 
diff --git a/egs/madcat_ar/v1/local/score.sh b/egs/madcat_ar/v1/local/score.sh
index 2c11aba3e13..31564d25326 100755
--- a/egs/madcat_ar/v1/local/score.sh
+++ b/egs/madcat_ar/v1/local/score.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 
-steps/scoring/score_kaldi_wer.sh --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@"
-steps/scoring/score_kaldi_cer.sh --stage 2 --word_ins_penalty 0.0,0.5,1.0,1.5,2.0,2.5,3.0,3.5,4.0,4.5,5.0,5.5,6.0,6.5,7.0 "$@"
+steps/scoring/score_kaldi_wer.sh "$@"
+steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
 
diff --git a/egs/madcat_ar/v1/local/train_lm.sh b/egs/madcat_ar/v1/local/train_lm.sh
index 3b8a382cb00..b7fc0b09a46 100755
--- a/egs/madcat_ar/v1/local/train_lm.sh
+++ b/egs/madcat_ar/v1/local/train_lm.sh
@@ -6,20 +6,19 @@
 #                2017  Hossein Hadian
 # Apache 2.0
 #
-# This script trains a LM on the MADCAT training transcriptions.
+# This script trains an LM on the training transcriptions.
 # It is based on the example scripts distributed with PocoLM
 # It will check if pocolm is installed and if not will proceed with installation
 
 set -e
 stage=0
-
+dir=data/local/local_lm
+order=6
 echo "$0 $@"  # Print the command line for logging
 . ./utils/parse_options.sh || exit 1;
 
-dir=data/local/local_lm
 lm_dir=${dir}/data
-segments=data/train/segmented_words
 
 mkdir -p $dir
@@ -43,12 +42,10 @@ bypass_metaparam_optim_opt=
 # These example numbers of metaparameters is for 4-gram model (with min-counts)
 # running with train_lm.py.
 # The dev perplexity should be close to the non-bypassed model.
-#bypass_metaparam_optim_opt=
 # Note: to use these example parameters, you may need to remove the .done files
 # to make sure the make_lm_dir.py be called and tain only 3-gram model
 #for order in 3; do
 #rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done
-
 if [ $stage -le 0 ]; then
   mkdir -p ${dir}/data
   mkdir -p ${dir}/data/text
@@ -65,7 +62,7 @@ if [ $stage -le 0 ]; then
   # use the training data as an additional data source.
   # we can later fold the dev data into this.
-  cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/madcat.txt
+  cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/train.txt
 
   # for reporting perplexities, we'll use the "real" dev set.
   # (the validation data is used as ${dir}/data/text/dev.txt to work
@@ -75,12 +72,10 @@ if [ $stage -le 0 ]; then
   cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt
 
   # get the wordlist from MADCAT text
-  cat ${dir}/data/text/madcat.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  cat ${dir}/data/text/train.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
   cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
 fi
 
-order=3
-
 if [ $stage -le 1 ]; then
   # decide on the vocabulary.
   # Note: you'd use --wordlist if you had a previously determined word-list
@@ -88,7 +83,7 @@ if [ $stage -le 1 ]; then
   # Note: if you have more than one order, use a certain amount of words as the
   # vocab and want to restrict max memory for 'sort',
   echo "$0: training the unpruned LM"
-  min_counts='train=2 madcat=1'
+  min_counts='train=1'
   wordlist=${dir}/data/wordlist
 
   lm_name="`basename ${wordlist}`_${order}"
@@ -96,13 +91,34 @@ if [ $stage -le 1 ]; then
     lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`"
   fi
   unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm
-  train_lm.py --wordlist=${wordlist} --num-splits=5 --warm-start-ratio=1 \
+  train_lm.py --wordlist=${wordlist} --num-splits=20 --warm-start-ratio=20 \
               --limit-unk-history=true \
               ${bypass_metaparam_optim_opt} \
               ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir}
 
   get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity'
-
   mkdir -p ${dir}/data/arpa
   format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz
 fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: pruning the LM (to larger size)"
+  # Using 1 million n-grams for a big LM for rescoring purposes.
+  size=1000000
+  prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity'
+
+  mkdir -p ${dir}/data/arpa
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz
+fi
+
+if [ $stage -le 3 ]; then
+  echo "$0: pruning the LM (to smaller size)"
+  # Using 500k n-grams for a smaller LM for graph building. Pruning from the
+  # bigger pruned LM is faster than pruning from the unpruned one.
+  size=500000
+  prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small
+
+  get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity'
+
+  format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz
+fi
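After this change train_lm.sh leaves three ARPA LMs in data/local/local_lm/data/arpa: ${order}gram_unpruned.arpa.gz, ${order}gram_big.arpa.gz (1M n-grams) and ${order}gram_small.arpa.gz (500k n-grams). A hedged sketch of how such a pruned pair is typically consumed in Kaldi recipes; the run scripts in this patch still format the unpruned LM, so the lang-dir names below are hypothetical:

    # small pruned LM -> first-pass decoding graph
    utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_small.arpa.gz \
      data/local/dict/lexicon.txt data/lang_test_small
    # big pruned LM -> const-arpa lattice rescoring
    utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_big.arpa.gz \
      data/lang data/lang_test_big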
diff --git a/egs/madcat_ar/v1/run.sh b/egs/madcat_ar/v1/run.sh
index 14c8bf7a6ce..f6a63320497 100755
--- a/egs/madcat_ar/v1/run.sh
+++ b/egs/madcat_ar/v1/run.sh
@@ -11,9 +11,7 @@ decode_gmm=false
 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not
 # already downloaded the database you can set it to a local directory
 # This corpus can be purchased here:
-# https://catalog.ldc.upenn.edu/LDC2012T15,
-# https://catalog.ldc.upenn.edu/LDC2013T09/,
-# https://catalog.ldc.upenn.edu/LDC2013T15/.
+# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09,LDC2013T15}
 download_dir1=/export/corpora/LDC/LDC2012T15/data
 download_dir2=/export/corpora/LDC/LDC2013T09/data
 download_dir3=/export/corpora/LDC/LDC2013T15/data
@@ -21,7 +19,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
 writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
 writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
 data_splits_dir=data/download/data_splits
-
+overwrite=false
 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.
 . ./path.sh
@@ -34,8 +32,14 @@ mkdir -p data/{train,test,dev}/data
 mkdir -p data/local/{train,test,dev}
 
 if [ $stage -le 0 ]; then
-  echo "$0: Downloading data splits..."
-  echo "Date: $(date)."
+
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing: data/train/text already exists and overwrite=false."
+    echo "$0: The script was probably restarted from the wrong stage; exiting to avoid data corruption."
+    exit 1;
+  fi
+
+  echo "$0: Downloading data splits...$(date)"
   local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
     --download_dir2 $download_dir2 --download_dir3 $download_dir3
 fi
@@ -79,7 +83,7 @@ fi
 if [ $stage -le 5 ]; then
   echo "$0: Estimating a language model for decoding..."
   local/train_lm.sh
-  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \
+  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
     data/local/dict/lexicon.txt data/lang_test
 fi
@@ -132,9 +136,9 @@ if [ $stage -le 12 ]; then
 fi
 
 if [ $stage -le 13 ]; then
-  local/chain/run_cnn_1a.sh
+  local/chain/run_cnn.sh
 fi
 
 if [ $stage -le 14 ]; then
-  local/chain/run_cnn_chainali_1a.sh --stage 2
+  local/chain/run_cnn_chainali.sh --stage 2
 fi
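The new guard aborts stage 0 when data/train/text already exists, so a restart from the wrong stage cannot silently clobber prepared data. Assuming the script picks up its top-level variables via utils/parse_options.sh, as Kaldi run scripts usually do (the sourcing is not visible in these hunks), a deliberate regeneration would look like:

    # explicitly allow stage 0 to overwrite existing data
    ./run.sh --stage 0 --overwrite true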
diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh
index 5d27476d3e1..3986ede9d7f 100755
--- a/egs/madcat_ar/v1/run_end2end.sh
+++ b/egs/madcat_ar/v1/run_end2end.sh
@@ -7,9 +7,7 @@ nj=70
 # download_dir{1,2,3} points to the database path on the JHU grid. If you have not
 # already downloaded the database you can set it to a local directory
 # This corpus can be purchased here:
-# https://catalog.ldc.upenn.edu/LDC2012T15,
-# https://catalog.ldc.upenn.edu/LDC2013T09/,
-# https://catalog.ldc.upenn.edu/LDC2013T15/.
+# https://catalog.ldc.upenn.edu/{LDC2012T15,LDC2013T09,LDC2013T15}
 download_dir1=/export/corpora/LDC/LDC2012T15/data
 download_dir2=/export/corpora/LDC/LDC2013T09/data
 download_dir3=/export/corpora/LDC/LDC2013T15/data
@@ -17,6 +15,7 @@ writing_condition1=/export/corpora/LDC/LDC2012T15/docs/writing_conditions.tab
 writing_condition2=/export/corpora/LDC/LDC2013T09/docs/writing_conditions.tab
 writing_condition3=/export/corpora/LDC/LDC2013T15/docs/writing_conditions.tab
 data_splits_dir=data/download/data_splits
+overwrite=false
 
 . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system.
            ## This relates to the queue.
@@ -27,15 +26,17 @@ data_splits_dir=data/download/data_splits
 mkdir -p data/{train,test,dev}/data
 mkdir -p data/local/{train,test,dev}
 
-
 if [ $stage -le 0 ]; then
-  echo "$0: Downloading data splits..."
-  echo "Date: $(date)."
+
+  if [ -f data/train/text ] && ! $overwrite; then
+    echo "$0: Not processing: data/train/text already exists and overwrite=false."
+    echo "$0: The script was probably restarted from the wrong stage; exiting to avoid data corruption."
+    exit 1;
+  fi
+  echo "$0: Downloading data splits...$(date)"
   local/download_data.sh --data_splits $data_splits_dir --download_dir1 $download_dir1 \
     --download_dir2 $download_dir2 --download_dir3 $download_dir3
-fi
 
-if [ $stage -le 1 ]; then
   for dataset in test train dev; do
     data_split_file=$data_splits_dir/madcat.$dataset.raw.lineid
     local/extract_lines.sh --nj $nj --cmd $cmd --data_split_file $data_split_file \
@@ -44,9 +45,7 @@ if [ $stage -le 1 ]; then
       --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3 \
       --data data/local/$dataset
   done
-fi
 
-if [ $stage -le 2 ]; then
   echo "$0: Preparing data..."
   local/prepare_data.sh --download_dir1 $download_dir1 --download_dir2 $download_dir2 \
     --download_dir3 $download_dir3 --images_scp_dir data/local \
@@ -54,75 +53,66 @@ if [ $stage -le 2 ]; then
     --writing_condition2 $writing_condition2 --writing_condition3 $writing_condition3
 fi
 
-if [ $stage -le 3 ]; then
-  echo "$0: Obtaining image groups. calling get_image2num_frames"
-  echo "Date: $(date)."
-  image/get_image2num_frames.py data/train # This will be needed for the next command
-  # The next command creates a "allowed_lengths.txt" file in data/train
-  # which will be used by local/make_features.py to enforce the images to
-  # have allowed lengths. The allowed lengths will be spaced by 10% difference in length.
-  echo "$0: Obtaining image groups. calling get_allowed_lengths"
-  echo "Date: $(date)."
+if [ $stage -le 1 ]; then
+  echo "$0: Obtaining image groups: calling get_image2num_frames $(date)."
+  image/get_image2num_frames.py data/train
   image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train
-fi
 
-if [ $stage -le 4 ]; then
   for dataset in test train; do
-    echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. "
-    echo "Date: $(date)."
+    echo "$0: Extracting features and calling compute_cmvn_stats for dataset: $dataset. $(date)"
    local/extract_features.sh --nj $nj --cmd $cmd --feat-dim 40 data/$dataset
     steps/compute_cmvn_stats.sh data/$dataset || exit 1;
   done
-  echo "$0: Fixing data directory for train dataset"
-  echo "Date: $(date)."
+  echo "$0: Fixing data directory for train dataset $(date)."
   utils/fix_data_dir.sh data/train
 fi
 
-if [ $stage -le 5 ]; then
-  echo "$0: Preparing dictionary and lang..."
+if [ $stage -le 2 ]; then
+  echo "$0: Preparing BPE..."
   cut -d' ' -f2- data/train/text | local/reverse.py | \
-    local/prepend_words.py | \
-    utils/lang/bpe/learn_bpe.py -s 700 > data/train/bpe.out
+    utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \
+    utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt
+
   for set in test train dev; do
     cut -d' ' -f1 data/$set/text > data/$set/ids
     cut -d' ' -f2- data/$set/text | local/reverse.py | \
-      local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/train/bpe.out \
+      utils/lang/bpe/prepend_words.py --encoding 'utf-8' | \
+      utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \
      | sed 's/@@//g' > data/$set/bpe_text
+
     mv data/$set/text data/$set/text.old
     paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text
+    rm -f data/$set/bpe_text data/$set/ids
   done
+
+  echo "$0: Preparing dictionary and lang..."
   local/prepare_dict.sh
-  # This recipe uses byte-pair encoding; the silences are part of the words' pronunciations.
-  # So we set --sil-prob to 0.0
   utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \
     data/local/dict "<sil>" data/lang/temp data/lang
   utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang
 fi
 
-if [ $stage -le 6 ]; then
+if [ $stage -le 3 ]; then
   echo "$0: Estimating a language model for decoding..."
   local/train_lm.sh
-  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_unpruned.arpa.gz \
+  utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \
     data/local/dict/lexicon.txt data/lang_test
 fi
 
-if [ $stage -le 7 ]; then
-  echo "$0: Calling the flat-start chain recipe..."
-  echo "Date: $(date)."
-  local/chain/run_flatstart_cnn1a.sh --nj $nj
+if [ $stage -le 4 ]; then
+  echo "$0: Calling the flat-start chain recipe... $(date)."
+  local/chain/run_e2e_cnn.sh --nj $nj
 fi
 
-if [ $stage -le 8 ]; then
-  echo "$0: Aligning the training data using the e2e chain model..."
-  echo "Date: $(date)."
+if [ $stage -le 5 ]; then
+  echo "$0: Aligning the training data using the e2e chain model...$(date)."
   steps/nnet3/align.sh --nj $nj --cmd "$cmd" \
     --use-gpu false \
     --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \
     data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train
 fi
 
-if [ $stage -le 9 ]; then
-  echo "$0: Building a tree and training a regular chain model using the e2e alignments..."
-  echo "Date: $(date)."
-  local/chain/run_cnn_e2eali_1b.sh --nj $nj
+if [ $stage -le 6 ]; then
+  echo "$0: Building a tree and training a regular chain model using the e2e alignments...$(date)"
+  local/chain/run_cnn_e2eali.sh --nj $nj
 fi
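The BPE stage above learns 700 merge operations on reversed, word-prepended training text, applies them to every split, and then strips the '@@' continuation markers with sed, leaving the subword units as plain space-separated tokens. A toy round trip with the same tools, assuming the codes file from the patch exists; the input string is only for illustration:

    echo "<some text>" | local/reverse.py | utils/lang/bpe/prepend_words.py --encoding 'utf-8' \
      | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt | sed 's/@@//g'

Once both systems are trained, the comparison helper referenced in the tuning-script headers can put them side by side, e.g. local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_e2eali_1b.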