From 36c2f527ead957d1637c1aa8b15bc69a1a2bf24c Mon Sep 17 00:00:00 2001
From: Chun-Chieh Chang <cchunch1@jhu.edu>
Date: Mon, 1 Oct 2018 13:00:06 -0400
Subject: [PATCH 1/3] minor change to handle cases where <s> appears in text

---
 egs/wsj/s5/utils/build_const_arpa_lm.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/egs/wsj/s5/utils/build_const_arpa_lm.sh b/egs/wsj/s5/utils/build_const_arpa_lm.sh
index ec067df0d39..51aca1bb2ad 100755
--- a/egs/wsj/s5/utils/build_const_arpa_lm.sh
+++ b/egs/wsj/s5/utils/build_const_arpa_lm.sh
@@ -34,8 +34,8 @@ mkdir -p $new_lang
 cp -r $old_lang/* $new_lang
 
 unk=`cat $new_lang/oov.int`
-bos=`grep -w "<s>" $new_lang/words.txt | awk '{print $2}'`
-eos=`grep "</s>" $new_lang/words.txt | awk '{print $2}'`
+bos=`grep "^<s>\s" $new_lang/words.txt | awk '{print $2}'`
+eos=`grep "^</s>\s" $new_lang/words.txt | awk '{print $2}'`
 if [[ -z $bos || -z $eos ]]; then
   echo "$0: <s> and </s> symbols are not in $new_lang/words.txt"
   exit 1

From a3464d01fe14faec651454c9b31f1807e871a7b4 Mon Sep 17 00:00:00 2001
From: System User <cchang@test2.cm.gemini>
Date: Wed, 16 Jan 2019 00:26:24 -0500
Subject: [PATCH 2/3] mitigating some overflow error with floats

---
 src/nnet3/convolution.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/nnet3/convolution.cc b/src/nnet3/convolution.cc
index 287ab7f47dd..1c5396949f8 100644
--- a/src/nnet3/convolution.cc
+++ b/src/nnet3/convolution.cc
@@ -976,7 +976,7 @@ static void ComputeTempMatrixSize(const ConvolutionComputationOptions &opts,
     // work out how many rows the temporary matrix should have, taking
     // into account the specified memory limit.
     temp_rows = computation->num_t_out * computation->num_images;
-    BaseFloat num_megabytes = (4 * temp_rows * temp_cols) / 1000000.0,
+    BaseFloat num_megabytes = (4 * (temp_rows / 1000.0) * (temp_cols / 1000.0)),
         megabyte_limit = opts.max_memory_mb;
     // C++ rounds down; here, we want to round up so we add one.
     int32 ratio = 1.0 + num_megabytes / megabyte_limit;
@@ -986,7 +986,7 @@ static void ComputeTempMatrixSize(const ConvolutionComputationOptions &opts,
     // >= temp_rows so that we don't have a small leftover piece.
     int32 new_num_t_out = (computation->num_t_out + ratio - 1) / ratio;
     temp_rows = new_num_t_out * computation->num_images;
-    BaseFloat new_num_megabytes = (4 * temp_rows * temp_cols) / 1000000.0;
+    BaseFloat new_num_megabytes = (4 * (temp_rows / 1000.0) * (temp_cols / 1000.0));
     // make sure we're within the memory limit.
     if (new_num_megabytes > 1.01 * megabyte_limit) {
       KALDI_WARN << "Memory consumed in convolution is more than requested "

From 69a1c32c947003ca8105b4dcc3fed6f7296b1a3a Mon Sep 17 00:00:00 2001
From: Chun-Chieh Chang <cchunch1@jhu.edu>
Date: Thu, 21 Feb 2019 16:16:53 -0500
Subject: [PATCH 3/3] minor bug fixes: syntax error in
 uw3/v1/local/process_data.py; iam/v1 doesn't use bpe, but
 iam/v1/local/train_lm.sh looks for a file generated by bpe

---
 egs/iam/v1/local/train_lm.sh     | 2 +-
 egs/uw3/v1/local/process_data.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh
index 911f54c5439..3e8c838efdb 100755
--- a/egs/iam/v1/local/train_lm.sh
+++ b/egs/iam/v1/local/train_lm.sh
@@ -60,7 +60,7 @@ if [ $stage -le 0 ]; then
   # Using LOB and brown corpus.
   if [ ! -f data/local/lob-train-only.txt ]; then
     cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \
-      local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \
+      local/remove_test_utterances_from_lob.py data/test/text data/val/text \
                                                > data/local/lob-train-only.txt
   fi
   cat data/local/lob-train-only.txt > ${dir}/data/text/lob.txt
diff --git a/egs/uw3/v1/local/process_data.py b/egs/uw3/v1/local/process_data.py
index 3643c0aca89..23b8e5402cf 100755
--- a/egs/uw3/v1/local/process_data.py
+++ b/egs/uw3/v1/local/process_data.py
@@ -52,10 +52,10 @@
       # The dataset is randomly split train 95% and test 5%
       coin = random.randint(0, 20)
       if coin >= 1:
-        train_text_fh.write(utt_id + ' ' + text + '\n')
+        train_text_fh.write("{} {}\n".format(utt_id, text))
         train_utt2spk_fh.write("{} {}\n".format(utt_id, page_count))
-        train_image_fh.write("{} {}\n".format(utt_id, image_path)
+        train_image_fh.write("{} {}\n".format(utt_id, image_path))
       elif coin < 1:
         test_text_fh.write("{} {}\n".format(utt_id, text))
         test_utt2spk_fh.write("{} {}\n".format(utt_id, page_count))
-        train_image_fh.write("{} {}\n".format(utt_id, image_path)
+        train_image_fh.write("{} {}\n".format(utt_id, image_path))