diff --git a/egs/iam/v1/local/chain/compare_wer.sh b/egs/iam/v1/local/chain/compare_wer.sh
index 4eb665fc702..ad90710b13f 100755
--- a/egs/iam/v1/local/chain/compare_wer.sh
+++ b/egs/iam/v1/local/chain/compare_wer.sh
@@ -11,6 +11,7 @@ if [ $# == 0 ]; then
   echo "e.g.: $0 exp/chain/cnn{1a,1b}"
   exit 1
 fi
+. ./path.sh

 echo "# $0 $*"
 used_epochs=false
@@ -26,6 +27,13 @@ for x in $*; do
 done
 echo

+echo -n "# CER "
+for x in $*; do
+  cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}')
+  printf "% 10s" $cer
+done
+echo
+
 if $used_epochs; then
   exit 0;  # the diagnostics aren't comparable between regular and discriminatively trained systems.
 fi
@@ -57,3 +65,10 @@ for x in $*; do
   printf "% 10s" $prob
 done
 echo
+
+echo -n "# Parameters "
+for x in $*; do
+  params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}')
+  printf "% 10s" $params
+done
+echo
diff --git a/egs/iam/v1/local/chain/run_cnn_1a.sh b/egs/iam/v1/local/chain/run_cnn_1a.sh
index 3b1571091c1..05cb9948bd9 100755
--- a/egs/iam/v1/local/chain/run_cnn_1a.sh
+++ b/egs/iam/v1/local/chain/run_cnn_1a.sh
@@ -7,9 +7,15 @@
 # steps/info/chain_dir_info.pl exp/chain/cnn_1a/
 # exp/chain/cnn_1a/: num-iters=21 nj=2..4 num-params=4.4M dim=40->364 combine=-0.021->-0.015 xent:train/valid[13,20,final]=(-1.05,-0.701,-0.591/-1.30,-1.08,-1.00) logprob:train/valid[13,20,final]=(-0.061,-0.034,-0.030/-0.107,-0.101,-0.098)
-# cat exp/chain/cnn_1a/decode_test/scoring_kaldi/best_*
-# %WER 5.94 [ 3913 / 65921, 645 ins, 1466 del, 1802 sub ] exp/chain/cnn_1a/decode_test//cer_11_0.0
-# %WER 9.13 [ 1692 / 18542, 162 ins, 487 del, 1043 sub ] exp/chain/cnn_1a/decode_test/wer_11_0.0
+# local/chain/compare_wer.sh exp/chain/cnn_1a/
+# System                      cnn_1a
+# WER                          18.58
+# CER                          10.17
+# Final train prob           -0.0122
+# Final valid prob           -0.0999
+# Final train prob (xent)    -0.5652
+# Final valid prob (xent)    -0.9758
+# Parameters                   4.36M

 set -e -o pipefail

 stage=0
@@ -40,7 +46,7 @@ tdnn_dim=450

 # training options
 srand=0
 remove_egs=false
-lang_test=lang_test
+lang_test=lang_unk
 # End configuration section.
 echo "$0 $@"  # Print the command line for logging
diff --git a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
index ddf596a6126..d6d0ee780f4 100755
--- a/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
+++ b/egs/iam/v1/local/chain/run_cnn_chainali_1b.sh
@@ -1,20 +1,20 @@
 #!/bin/bash

 # chainali_1b is as chainali_1a except it has 3 more cnn layers and 1 less tdnn layer.
-# ./local/chain/compare_wer.sh exp/chain/cnn_chainali_1a/ exp/chain/cnn_chainali_1b/
-# System                     cnn_chainali_1a  cnn_chainali_1b
-# WER                                   6.69             6.25
-# Final train prob                   -0.0132          -0.0041
-# Final valid prob                   -0.0509          -0.0337
-# Final train prob (xent)            -0.6393          -0.6287
-# Final valid prob (xent)            -1.0116          -0.9064
+
+# local/chain/compare_wer.sh exp/chain/cnn_1a/ exp/chain/cnn_chainali_1b/
+# System                      cnn_1a  cnn_chainali_1b
+# WER                          18.58            14.67
+# CER                          10.17             7.31
+# Final train prob           -0.0122           0.0042
+# Final valid prob           -0.0999          -0.0256
+# Final train prob (xent)    -0.5652          -0.6282
+# Final valid prob (xent)    -0.9758          -0.9096
+# Parameters                   4.36M            3.96M

 # steps/info/chain_dir_info.pl exp/chain/chainali_cnn_1b/
 # exp/chain/chainali_cnn_1b/: num-iters=21 nj=2..4 num-params=4.0M dim=40->364 combine=-0.009->-0.005 xent:train/valid[13,20,final]=(-1.47,-0.728,-0.623/-1.69,-1.02,-0.940) logprob:train/valid[13,20,final]=(-0.068,-0.030,-0.011/-0.086,-0.056,-0.038)
-# cat exp/chain/cnn_chainali_1b/decode_test/scoring_kaldi/best_*
-# %WER 3.94 [ 2600 / 65921, 415 ins, 1285 del, 900 sub ] exp/chain/cnn_chainali_1b/decode_test/cer_10_0.0
-# %WER 6.25 [ 1158 / 18542, 103 ins, 469 del, 586 sub ] exp/chain/cnn_chainali_1b/decode_test/wer_12_0.0

 set -e -o pipefail

 stage=0
@@ -46,7 +46,7 @@ tdnn_dim=450

 # training options
 srand=0
 remove_egs=false
-lang_test=lang_test
+lang_test=lang_unk
 # End configuration section.
 echo "$0 $@"  # Print the command line for logging
diff --git a/egs/iam/v1/local/prepare_dict.sh b/egs/iam/v1/local/prepare_dict.sh
index 0c3bb325023..8b981de3abd 100755
--- a/egs/iam/v1/local/prepare_dict.sh
+++ b/egs/iam/v1/local/prepare_dict.sh
@@ -15,29 +15,27 @@
 cat data/train/text | \
   perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \
   sort -u > $dir/nonsilence_phones.txt

-# Now list all the unique words (that use only the above letters)
-# in data/train/text and LOB+Brown corpora with their comprising
-# letters as their transcription. (Letter # is replaced with <HASH>)
+# Now use pocolm's wordlist, which holds the N most frequent words from
+# data/train/text and the LOB+Brown corpora (dev and test excluded), with their
+# comprising letters as their transcription. Only include words that use the above letters.
+# (Letter # is replaced with <HASH>)

 export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n")

-cut -d' ' -f2- data/train/text | \
-  cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt \
-      data/local/browncorpus/brown.txt - | \
+cat data/local/local_lm/data/wordlist | \
 perl -e '$letters=$ENV{letters};
-while(<>){ @A = split;
-  foreach(@A) {
-    if(! $seen{$_} && $_ =~ m/^[$letters]+$/){
-      $seen{$_} = 1;
-      $trans = join(" ", split(//));
+while(<>){
+  chop;
+  $w = $_;
+  if($w =~ m/^[$letters]+$/){
+    $trans = join(" ", split(//, $w));
       $trans =~ s/#/<HASH>/g;
-      print "$_ $trans\n";
+      print "$w $trans\n";
     }
-  }
-}' | sort > $dir/lexicon.txt
+}' | sort -u > $dir/lexicon.txt

-sed -i '' "s/#/<HASH>/" $dir/nonsilence_phones.txt
+sed -i "s/#/<HASH>/" $dir/nonsilence_phones.txt

 echo '<sil> SIL' >> $dir/lexicon.txt
 echo '<unk> SIL' >> $dir/lexicon.txt
diff --git a/egs/iam/v1/local/remove_test_utterances_from_lob.py b/egs/iam/v1/local/remove_test_utterances_from_lob.py
new file mode 100755
index 00000000000..1b414ef47f6
--- /dev/null
+++ b/egs/iam/v1/local/remove_test_utterances_from_lob.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+# Copyright  2018  Ashish Arora
+
+import argparse
+import os
+import numpy as np
+import sys
+import re
+
+parser = argparse.ArgumentParser(description="""Removes dev/test set lines
+                                 from the LOB corpus. Reads the
+                                 corpus from stdin, and writes it to stdout.""")
+parser.add_argument('dev_text', type=str,
+                    help='dev transcription location.')
+parser.add_argument('test_text', type=str,
+                    help='test transcription location.')
+args = parser.parse_args()
+
+def remove_punctuations(transcript):
+    char_list = []
+    for char in transcript:
+        if char.isdigit() or char == '+' or char == '~' or char == '?':
+            continue
+        if char == '#' or char == '=' or char == '-' or char == '!':
+            continue
+        if char == ',' or char == '.' or char == ')' or char == '\'':
+            continue
+        if char == '(' or char == ':' or char == ';' or char == '"':
+            continue
+        char_list.append(char)
+    return char_list
+
+
+def remove_special_words(words):
+    word_list = []
+    for word in words:
+        if word == '' or word == '#':
+            continue
+        word_list.append(word)
+    return word_list
+
+
+# process dev/eval transcripts and add them to utterance_dict:
+# remove special words and punctuation, drop spaces between words,
+# and lowercase the characters
+def read_utterances(text_file_path):
+    with open(text_file_path, 'rt') as in_file:
+        for line in in_file:
+            words = line.strip().split()
+            words_wo_sw = remove_special_words(words)
+            transcript = ''.join(words_wo_sw[1:])
+            transcript = transcript.lower()
+            trans_wo_punct = remove_punctuations(transcript)
+            transcript = ''.join(trans_wo_punct)
+            utterance_dict[words_wo_sw[0]] = transcript
+
+
+### main ###
+
+# read utterances and add them to utterance_dict
+utterance_dict = dict()
+read_utterances(args.dev_text)
+read_utterances(args.test_text)
+
+# read the corpus and add it to the lists below
+corpus_text_lowercase_wo_sc = list()
+corpus_text_wo_sc = list()
+original_corpus_text = list()
+for line in sys.stdin:
+    original_corpus_text.append(line)
+    words = line.strip().split()
+    words_wo_sw = remove_special_words(words)
+
+    transcript = ''.join(words_wo_sw)
+    transcript = transcript.lower()
+    trans_wo_punct = remove_punctuations(transcript)
+    transcript = ''.join(trans_wo_punct)
+    corpus_text_lowercase_wo_sc.append(transcript)
+
+    transcript = ''.join(words_wo_sw)
+    trans_wo_punct = remove_punctuations(transcript)
+    transcript = ''.join(trans_wo_punct)
+    corpus_text_wo_sc.append(transcript)
+
+# try to find each dev/test utterance in the corpus below;
+# utterances which were not found are added to
+# remaining_utterances
+row_to_keep = [True for i in range(len(original_corpus_text))]
+remaining_utterances = dict()
+for line_id, line_to_find in utterance_dict.items():
+    found_line = False
+    for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)):
+        # Combine 3 consecutive lines of the corpus into a single line
+        prev_words = corpus_text_lowercase_wo_sc[i - 1].strip()
+        curr_words = corpus_text_lowercase_wo_sc[i].strip()
+        next_words = corpus_text_lowercase_wo_sc[i + 1].strip()
+        new_line = prev_words + curr_words + next_words
+        transcript = ''.join(new_line)
+        if line_to_find in transcript:
+            found_line = True
+            row_to_keep[i-1] = False
+            row_to_keep[i] = False
+            row_to_keep[i+1] = False
+    if not found_line:
+        remaining_utterances[line_id] = line_to_find
+
+
+for i in range(len(original_corpus_text)):
+    transcript = original_corpus_text[i].strip()
+    if row_to_keep[i]:
+        print(transcript)
+
+print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr)
+print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr)
+print('Number of sentences not removed from LOB: {}'.
+      format(len(remaining_utterances)), file=sys.stderr)
+print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text),
+      row_to_keep.count(True)), file=sys.stderr)
diff --git a/egs/iam/v1/local/score.sh b/egs/iam/v1/local/score.sh
index 31564d25326..d964d70206b 100755
--- a/egs/iam/v1/local/score.sh
+++ b/egs/iam/v1/local/score.sh
@@ -1,5 +1,157 @@
 #!/bin/bash
+# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey, Yenda Trmal)
+# Apache 2.0
+# This script is like steps/scoring/score_kaldi_wer.sh except that it transcribes
+# the <unk>'s using local/unk_arc_post_to_transcription.py and also calls
+# steps/scoring/score_kaldi_cer.sh at the end.

-steps/scoring/score_kaldi_wer.sh "$@"
-steps/scoring/score_kaldi_cer.sh --stage 2 "$@"
+[ -f ./path.sh ] && . ./path.sh
+
+# begin configuration section.
+cmd=run.pl
+stage=0
+decode_mbr=false
+stats=true
+beam=6
+word_ins_penalty=0.0,0.5,1.0
+min_lmwt=3
+max_lmwt=13
+iter=final
+#end configuration section.
+
+echo "$0 $@"  # Print the command line for logging
+[ -f ./path.sh ] && . ./path.sh
+. parse_options.sh || exit 1;
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>"
+  echo " Options:"
+  echo "    --cmd (run.pl|queue.pl...)      # specify how to run the sub-processes."
+  echo "    --stage (0|1|2)                 # start scoring script from part-way through."
+  echo "    --decode_mbr (true/false)       # minimum Bayes risk decoding (confusion network)."
+  echo "    --min_lmwt <int>                # minimum LM-weight for lattice rescoring "
+  echo "    --max_lmwt <int>                # maximum LM-weight for lattice rescoring "
+  exit 1;
+fi
+
+data=$1
+lang_or_graph=$2
+dir=$3
+model_path=`echo $dir |xargs dirname`
+symtab=$lang_or_graph/words.txt
+
+for f in $symtab $dir/lat.1.gz $data/text; do
+  [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1;
+done
+
+
+ref_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter"
+hyp_filtering_cmd="cat"
+[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter"
+[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter"
+
+
+if $decode_mbr ; then
+  echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty"
+else
+  echo "$0: scoring with word insertion penalty=$word_ins_penalty"
+fi
+
+
+mkdir -p $dir/scoring_kaldi
+cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1;
+if [ $stage -le 0 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    mkdir -p $dir/scoring_kaldi/penalty_$wip/log
+
+    if $decode_mbr ; then
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        acwt=\`perl -e \"print 1.0/LMWT\"\`\; \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-prune --beam=$beam ark:- ark:- \| \
+        lattice-mbr-decode --word-symbol-table=$symtab \
+        ark:- ark,t:- \| \
+        utils/int2sym.pl -f 2- $symtab \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+
+    else
+      $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \
+        lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \
+        lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \
+        lattice-1best ark:- ark:- \| \
+        lattice-align-words $lang_or_graph/phones/word_boundary.int $model_path/final.mdl ark:- ark:- \| \
+        lattice-arc-post $model_path/final.mdl ark:- - \| \
+        local/unk_arc_post_to_transcription.py $lang_or_graph/phones.txt $lang_or_graph/words.txt data/lang_unk/oov.int \| \
+        $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1;
+    fi
+
+    $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \
+      cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \
+      compute-wer --text --mode=present \
+      "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1;
+
+  done
+fi
+
+
+
+if [ $stage -le 1 ]; then
+
+  for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do
+    for lmwt in $(seq $min_lmwt $max_lmwt); do
+      # adding /dev/null to the command list below forces grep to output the filename
+      grep WER $dir/wer_${lmwt}_${wip} /dev/null
+    done
+  done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1
+
+  best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer)
+  best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}')
+  best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}')
+
+  if [ -z "$best_lmwt" ]; then
+    echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong."
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+      utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+      ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+      '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \
+  --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \
+  $data $lang_or_graph $dir
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all wer_{?,??} and cer_{?,??} files;
+# these originate from the previous version of the scoring scripts and could
+# otherwise cause confusion about what this script produces.
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
+
+exit 0;
diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh
index aa4303d6a28..a673c5b3f2d 100755
--- a/egs/iam/v1/local/train_lm.sh
+++ b/egs/iam/v1/local/train_lm.sh
@@ -13,6 +13,7 @@
 set -e
 stage=0
+vocab_size=50000

 echo "$0 $@"  # Print the command line for logging
 . ./utils/parse_options.sh || exit 1;
@@ -57,8 +58,10 @@ if [ $stage -le 0 ]; then
   rm ${dir}/data/text/* 2>/dev/null || true

   # Using LOB and brown corpus.
-  cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt > ${dir}/data/text/text.txt
-  cat data/local/browncorpus/brown.txt >> ${dir}/data/text/text.txt
+  cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \
+    local/remove_test_utterances_from_lob.py data/test/text data/val/text \
+    > ${dir}/data/text/lob.txt
+  cat data/local/browncorpus/brown.txt >> ${dir}/data/text/brown.txt

   # use the validation data as the dev set.
   # Note: the name 'dev' is treated specially by pocolm, it automatically
@@ -78,8 +81,8 @@ if [ $stage -le 0 ]; then
   cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt

   # get the wordlist from IAM text
-  cat ${dir}/data/text/{iam,text}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
-  cat ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
+  cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count
+  head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist
 fi

 order=3
@@ -91,7 +94,7 @@ if [ $stage -le 1 ]; then
   # Note: if you have more than one order, use a certain amount of words as the
   # vocab and want to restrict max memory for 'sort',
   echo "$0: training the unpruned LM"
-  min_counts='train=2 iam=1'
+  min_counts='brown=2 lob=2 iam=1'
   wordlist=${dir}/data/wordlist

   lm_name="`basename ${wordlist}`_${order}"
diff --git a/egs/iam/v1/local/unk_arc_post_to_transcription.py b/egs/iam/v1/local/unk_arc_post_to_transcription.py
new file mode 100755
index 00000000000..c86d35e4b8a
--- /dev/null
+++ b/egs/iam/v1/local/unk_arc_post_to_transcription.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python
+
+# Copyright  2017  Ashish Arora
+
+import argparse
+import sys
+
+parser = argparse.ArgumentParser(description="""uses phones to convert unk to word""")
+parser.add_argument('phones', type=str, help='phones and phonesID')
+parser.add_argument('words', type=str, help='word and wordID')
+parser.add_argument('unk', type=str, default='-', help='location of unk file')
+parser.add_argument('--input-ark', type=str, default='-', help='where to read the input data')
+parser.add_argument('--out-ark', type=str, default='-', help='where to write the output data')
+args = parser.parse_args()
+### main ###
+phone_fh = open(args.phones, 'r')
+word_fh = open(args.words, 'r')
+unk_fh = open(args.unk, 'r')
+if args.input_ark == '-':
+    input_fh = sys.stdin
+else:
+    input_fh = open(args.input_ark, 'r')
+if args.out_ark == '-':
+    out_fh = sys.stdout
+else:
+    out_fh = open(args.out_ark, 'wb')
+
+phone_dict = dict()  # stores phoneID and phone mapping
+phone_data_vect = phone_fh.read().strip().split("\n")
+for key_val in phone_data_vect:
+    key_val = key_val.split(" ")
+    phone_dict[key_val[1]] = key_val[0]
+word_dict = dict()
+word_data_vect = word_fh.read().strip().split("\n")
+for key_val in word_data_vect:
+    key_val = key_val.split(" ")
+    word_dict[key_val[1]] = key_val[0]
+unk_val = unk_fh.read().strip().split(" ")[0]
+
+utt_word_dict = dict()
+utt_phone_dict = dict()  # stores utteranceID and phoneID
+unk_word_dict = dict()
+count = 0
+for line in input_fh:
+    line_vect = line.strip().split("\t")
+    if len(line_vect) < 6:
+        print "IndexError"
+        print line_vect
+        continue
+    uttID = line_vect[0]
+    word = line_vect[4]
+    phones = line_vect[5]
+    if uttID in utt_word_dict.keys():
+        utt_word_dict[uttID][count] = word
+        utt_phone_dict[uttID][count] = phones
+    else:
+        count = 0
+        utt_word_dict[uttID] = dict()
+        utt_phone_dict[uttID] = dict()
+        utt_word_dict[uttID][count] = word
+        utt_phone_dict[uttID][count] = phones
+    if word == unk_val:  # get character sequence for unk
+        phone_key_vect = phones.split(" ")
+        phone_val_vect = list()
+        for pkey in phone_key_vect:
+            phone_val_vect.append(phone_dict[pkey])
+        phone_2_word = list()
+        for phone_val in phone_val_vect:
+            phone_2_word.append(phone_val.split('_')[0])
+        phone_2_word = ''.join(phone_2_word)
+        utt_word_dict[uttID][count] = phone_2_word
+    else:
+        if word == '0':
+            word_val = ' '
+        else:
+            word_val = word_dict[word]
+        utt_word_dict[uttID][count] = word_val
+    count += 1
+
+transcription = ""
+for key in sorted(utt_word_dict.iterkeys()):
+    transcription = key
+    for index in sorted(utt_word_dict[key].iterkeys()):
+        value = utt_word_dict[key][index]
+        transcription = transcription + " " + value
+    out_fh.write(transcription + '\n')
diff --git a/egs/iam/v1/run.sh b/egs/iam/v1/run.sh
index d5f66ca4110..f5c4a2b8f80 100755
--- a/egs/iam/v1/run.sh
+++ b/egs/iam/v1/run.sh
@@ -21,7 +21,6 @@ iam_database=/export/corpora5/handwriting_ocr/IAM
 . ./utils/parse_options.sh  # e.g. this parses the above options
                             # if supplied.

-
 ./local/check_tools.sh

 if [ $stage -le 0 ]; then
@@ -42,17 +41,26 @@ if [ $stage -le 1 ]; then
 fi

 if [ $stage -le 2 ]; then
+  echo "$0: Estimating a language model for decoding..."
+  # We do this stage before dict preparation because prepare_dict.sh
+  # generates the lexicon from pocolm's wordlist
+  local/train_lm.sh --vocab-size 50000
+fi
+
+if [ $stage -le 3 ]; then
   echo "$0: Preparing dictionary and lang..."
   local/prepare_dict.sh
   utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.95 \
                         data/local/dict "<unk>" data/lang/temp data/lang
-fi
-
-if [ $stage -le 3 ]; then
-  echo "$0: Estimating a language model for decoding..."
-  local/train_lm.sh
   utils/format_lm.sh data/lang data/local/local_lm/data/arpa/3gram_big.arpa.gz \
                      data/local/dict/lexicon.txt data/lang_test
+  echo "$0: Preparing the unk model for open-vocab decoding..."
+  utils/lang/make_unk_lm.sh --ngram-order 4 --num-extra-ngrams 7500 \
+                            data/local/dict exp/unk_lang_model
+  utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 \
+                        --unk-fst exp/unk_lang_model/unk_fst.txt \
+                        data/local/dict "<unk>" data/local/temp data/lang_unk
+  cp data/lang_test/G.fst data/lang_unk/G.fst
 fi

 if [ $stage -le 4 ]; then