From 36c2f527ead957d1637c1aa8b15bc69a1a2bf24c Mon Sep 17 00:00:00 2001 From: Chun-Chieh Chang Date: Mon, 1 Oct 2018 13:00:06 -0400 Subject: [PATCH 1/3] minor change to handle cases where is appears in text --- egs/wsj/s5/utils/build_const_arpa_lm.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/wsj/s5/utils/build_const_arpa_lm.sh b/egs/wsj/s5/utils/build_const_arpa_lm.sh index ec067df0d39..51aca1bb2ad 100755 --- a/egs/wsj/s5/utils/build_const_arpa_lm.sh +++ b/egs/wsj/s5/utils/build_const_arpa_lm.sh @@ -34,8 +34,8 @@ mkdir -p $new_lang cp -r $old_lang/* $new_lang unk=`cat $new_lang/oov.int` -bos=`grep -w "<s>" $new_lang/words.txt | awk '{print $2}'` -eos=`grep "</s>" $new_lang/words.txt | awk '{print $2}'` +bos=`grep "^<s>\s" $new_lang/words.txt | awk '{print $2}'` +eos=`grep "^</s>\s" $new_lang/words.txt | awk '{print $2}'` if [[ -z $bos || -z $eos ]]; then echo "$0: <s> and </s> symbols are not in $new_lang/words.txt" exit 1 From a3464d01fe14faec651454c9b31f1807e871a7b4 Mon Sep 17 00:00:00 2001 From: System User Date: Wed, 16 Jan 2019 00:26:24 -0500 Subject: [PATCH 2/3] mitigating some overflow error with floats --- src/nnet3/convolution.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nnet3/convolution.cc b/src/nnet3/convolution.cc index 287ab7f47dd..1c5396949f8 100644 --- a/src/nnet3/convolution.cc +++ b/src/nnet3/convolution.cc @@ -976,7 +976,7 @@ static void ComputeTempMatrixSize(const ConvolutionComputationOptions &opts, // work out how many rows the temporary matrix should have, taking // into account the specified memory limit. temp_rows = computation->num_t_out * computation->num_images; - BaseFloat num_megabytes = (4 * temp_rows * temp_cols) / 1000000.0, + BaseFloat num_megabytes = (4 * (temp_rows / 1000.0) * (temp_cols / 1000.0)), megabyte_limit = opts.max_memory_mb; // C++ rounds down; here, we want to round up so we add one. 
int32 ratio = 1.0 + num_megabytes / megabyte_limit; @@ -986,7 +986,7 @@ static void ComputeTempMatrixSize(const ConvolutionComputationOptions &opts, // >= temp_rows so that we don't have a small leftover piece. int32 new_num_t_out = (computation->num_t_out + ratio - 1) / ratio; temp_rows = new_num_t_out * computation->num_images; - BaseFloat new_num_megabytes = (4 * temp_rows * temp_cols) / 1000000.0; + BaseFloat new_num_megabytes = (4 * (temp_rows / 1000.0) * (temp_cols / 1000.0)); // make sure we're within the memory limit. if (new_num_megabytes > 1.01 * megabyte_limit) { KALDI_WARN << "Memory consumed in convolution is more than requested " From 69a1c32c947003ca8105b4dcc3fed6f7296b1a3a Mon Sep 17 00:00:00 2001 From: Chun-Chieh Chang Date: Thu, 21 Feb 2019 16:16:53 -0500 Subject: [PATCH 3/3] minor bug fixes. Syntax error in uw3/v1/local/process_data.py and iam/v1 doesn't use bpe but iam/v1/local/train_lm.sh looks for a file generated by bpe --- egs/iam/v1/local/train_lm.sh | 2 +- egs/uw3/v1/local/process_data.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/iam/v1/local/train_lm.sh b/egs/iam/v1/local/train_lm.sh index 911f54c5439..3e8c838efdb 100755 --- a/egs/iam/v1/local/train_lm.sh +++ b/egs/iam/v1/local/train_lm.sh @@ -60,7 +60,7 @@ if [ $stage -le 0 ]; then # Using LOB and brown corpus. if [ ! 
-f data/local/lob-train-only.txt ]; then cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ - local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \ + local/remove_test_utterances_from_lob.py data/test/text data/val/text \ > data/local/lob-train-only.txt fi cat data/local/lob-train-only.txt > ${dir}/data/text/lob.txt diff --git a/egs/uw3/v1/local/process_data.py b/egs/uw3/v1/local/process_data.py index 3643c0aca89..23b8e5402cf 100755 --- a/egs/uw3/v1/local/process_data.py +++ b/egs/uw3/v1/local/process_data.py @@ -52,10 +52,10 @@ # The dataset is randomly split train 95% and test 5% coin = random.randint(0, 20) if coin >= 1: - train_text_fh.write(utt_id + ' ' + text + '\n') + train_text_fh.write("{} {}\n".format(utt_id, text)) train_utt2spk_fh.write("{} {}\n".format(utt_id, page_count)) - train_image_fh.write("{} {}\n".format(utt_id, image_path) + train_image_fh.write("{} {}\n".format(utt_id, image_path)) elif coin < 1: test_text_fh.write("{} {}\n".format(utt_id, text)) test_utt2spk_fh.write("{} {}\n".format(utt_id, page_count)) - train_image_fh.write("{} {}\n".format(utt_id, image_path) + train_image_fh.write("{} {}\n".format(utt_id, image_path))