From 5f3da2d15477b77b2f22033b5419c5462e929878 Mon Sep 17 00:00:00 2001
From: Ke Li
Date: Wed, 21 Feb 2018 21:30:39 -0500
Subject: [PATCH 1/3] update tedlium ppl numbers

---
 egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
index cc0410c3519..e8fad2a0e27 100755
--- a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
+++ b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
@@ -5,9 +5,9 @@
 #           2017 Ke Li
 
 # rnnlm/train_rnnlm.sh: best iteration (out of 10) was 8, linking it to final iteration.
-# rnnlm/train_rnnlm.sh: train/dev perplexity was 78.4 / 147.8.
-# Train objf: -1556.00 -5.43 -5.15 -5.00 -4.90 -4.82 -4.75 -4.69 -4.63 -4.58
-# Dev objf:   -11.92 -5.70 -5.29 -5.16 -5.08 -5.04 -5.02 -5.00 -5.00 -5.00
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 100.9 / 155.7.
+# Train objf: -6.19 -5.43 -5.14 -4.99 -4.88 -4.80 -4.73 -4.66 -4.60 -4.54
+# Dev objf:   -11.92 -5.75 -5.33 -5.19 -5.12 -5.09 -5.06 -5.05 -5.05 -5.05
 
 # Begin configuration section.
 dir=exp/rnnlm_lstm_tdnn
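For context on how the two sets of numbers in logs like the one above relate: rnnlm/train_rnnlm.sh reports objf as the average per-word log-probability (natural log), so perplexity is just the exponential of its negation. A one-line sanity check in shell (a sketch, not part of the patch; 5.05 is the final dev objf quoted above):

    awk 'BEGIN { printf "exp(5.05) = %.1f\n", exp(5.05) }'
    # prints: exp(5.05) = 156.0, in line with the reported dev perplexity of 155.7
    # (the report uses the unrounded objf of the best iteration, hence the small residual)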
From e0129b21696431a17b1a612b9d3d5ea295bcd0fe Mon Sep 17 00:00:00 2001
From: Ke Li
Date: Thu, 1 Mar 2018 13:15:09 -0500
Subject: [PATCH 2/3] add RNNLM rescoring results; add RNNLM trained on +lm1b

---
 .../s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh |  98 ++++++++---
 .../rnnlm/tuning/run_lstm_tdnn_with_lm1b.sh   | 156 ++++++++++++++++++
 2 files changed, 233 insertions(+), 21 deletions(-)
 create mode 100755 egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn_with_lm1b.sh

diff --git a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
index e8fad2a0e27..9887e5ee150 100755
--- a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
+++ b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
@@ -2,29 +2,52 @@
 
 # Copyright 2012  Johns Hopkins University (author: Daniel Povey)  Tony Robinson
 #           2017 Hainan Xu
-#           2017 Ke Li
+#           2018 Ke Li
 
-# rnnlm/train_rnnlm.sh: best iteration (out of 10) was 8, linking it to final iteration.
-# rnnlm/train_rnnlm.sh: train/dev perplexity was 100.9 / 155.7.
-# Train objf: -6.19 -5.43 -5.14 -4.99 -4.88 -4.80 -4.73 -4.66 -4.60 -4.54
-# Dev objf:   -11.92 -5.75 -5.33 -5.19 -5.12 -5.09 -5.06 -5.05 -5.05 -5.05
+# rnnlm/train_rnnlm.sh: best iteration (out of 9) was 8, linking it to final iteration.
+# rnnlm/train_rnnlm.sh: train/dev perplexity was 94.1 / 155.1.
+# Train objf: -6.24 -5.45 -5.12 -4.95 -4.84 -4.74 -4.66 -4.59 -4.52 -4.46
+# Dev objf:   -11.92 -5.80 -5.32 -5.17 -5.10 -5.07 -5.05 -5.05 -5.04 -5.06
+
+# 1-pass results
+# %WER 8.3 | 1155 27500 | 92.7 4.9 2.4 1.0 8.3 68.8 | -0.019 | /export/a12/ywang/kaldi/egs/tedlium/s5_r2/exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi/decode_looped_test/score_10_0.0/ctm.filt.filt.sys
+
+# 4-gram rescoring
+# %WER 7.8 | 1155 27500 | 93.1 4.5 2.4 0.9 7.8 66.4 | -0.089 | /export/a12/ywang/kaldi/egs/tedlium/s5_r2/exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi/decode_looped_test_rescore/score_10_0.0/ctm.filt.filt.sys
+
+# RNNLM lattice rescoring
+# %WER 7.4 | 1155 27500 | 93.4 4.2 2.4 0.8 7.4 65.0 | -0.117 | exp/decode_test_rnnlm_tedlium/score_10_0.0/ctm.filt.filt.sys
+
+# RNNLM nbest rescoring
+# %WER 7.7 | 1155 27500 | 93.1 4.3 2.6 0.8 7.7 65.8 | -0.855 | exp/decode_test_rnnlm_tedlium_nbest/score_10_0.0/ctm.filt.filt.sys
 
 # Begin configuration section.
+cmd=run.pl
+decode_cmd=run.pl
 dir=exp/rnnlm_lstm_tdnn
-embedding_dim=800
-lstm_rpd=200
-lstm_nrpd=200
-stage=-10
+embedding_dim=1024
+lstm_rpd=256
+lstm_nrpd=256
+stage=0
 train_stage=-10
 epochs=20
 
-. ./cmd.sh
-. utils/parse_options.sh
-[ -z "$cmd" ] && cmd=$train_cmd
+# variables for lattice rescoring
+run_lat_rescore=true
+run_nbest_rescore=true
+decode_dir_suffix=rnnlm_tedlium
+ac_model_dir=/export/a12/ywang/kaldi/egs/tedlium/s5_r2/exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi
+ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
+              # if it's set, it merges histories in the lattice if they share
+              # the same ngram history and this prevents the lattice from
+              # exploding exponentially
+pruned_rescore=true
+
+. ./cmd.sh
+. ./utils/parse_options.sh
 
-text=data/train/text
 wordlist=data/lang/words.txt
+text=data/train/text
 dev_sents=10000
 text_dir=data/rnnlm/text
 mkdir -p $dir/config
@@ -37,14 +60,15 @@ done
 
 if [ $stage -le 0 ]; then
   mkdir -p $text_dir
-  cat $text | cut -d ' ' -f2- | head -n $dev_sents> $text_dir/dev.txt
+  cat $text | cut -d ' ' -f2- | head -n $dev_sents > $text_dir/dev.txt
   cat $text | cut -d ' ' -f2- | tail -n +$[$dev_sents+1] > $text_dir/ted.txt
 fi
+
 if [ $stage -le 1 ]; then
   cp $wordlist $dir/config/
-  n=`cat $dir/config/words.txt | wc -l`
-  echo "<brk> $n" >> $dir/config/words.txt
+  n=`cat $dir/config/words.txt | wc -l`
+  echo "<brk> $n" >> $dir/config/words.txt
 
   # words that are not present in words.txt but are in the training or dev data, will be
   # mapped to <unk> during training.
@@ -66,8 +90,9 @@ EOF
                            --min-frequency 1.0e-03 \
                            --special-words='<s>,</s>,<brk>,<unk>' \
                            $dir/config/words.txt > $dir/config/features.txt
+fi
 
-cat >$dir/config/xconfig <<EOF
+if [ $stage -le 2 ]; then
+  cat >$dir/config/xconfig <<EOF
 input dim=$embedding_dim name=input
 relu-renorm-layer name=tdnn1 dim=$embedding_dim input=Append(0, IfDefined(-1))
 fast-lstmp-layer name=lstm1 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
 relu-renorm-layer name=tdnn2 dim=$embedding_dim input=Append(0, IfDefined(-3))
 fast-lstmp-layer name=lstm2 cell-dim=$embedding_dim recurrent-projection-dim=$lstm_rpd non-recurrent-projection-dim=$lstm_nrpd
 relu-renorm-layer name=tdnn3 dim=$embedding_dim input=Append(0, IfDefined(-3))
 output-layer name=output include-log-softmax=false dim=$embedding_dim
 EOF
+fi
 
 if [ $stage -le 3 ]; then
   rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 1 \
                   --stage $train_stage --num-epochs $epochs --cmd "queue.pl" $dir
 fi
+
+if [ $stage -le 4 ] && $run_lat_rescore; then
+  echo "$0: Perform lattice-rescoring on $ac_model_dir"
+  pruned=
+  if $pruned_rescore; then
+    pruned=_pruned
+  fi
+  for decode_set in dev test; do
+    decode_dir=${ac_model_dir}/decode_looped_${decode_set}
+
+    # Lattice rescoring
+    rnnlm/lmrescore$pruned.sh \
+      --cmd "$decode_cmd --mem 4G" \
+      --weight 0.5 --max-ngram-order $ngram_order \
+      data/lang $dir \
+      data/${decode_set}_hires ${decode_dir} \
+      exp/decode_${decode_set}_${decode_dir_suffix}
+  done
+fi
+
+if [ $stage -le 5 ] && $run_nbest_rescore; then
+  echo "$0: Perform nbest-rescoring on $ac_model_dir"
+  for decode_set in dev test; do
+    decode_dir=${ac_model_dir}/decode_looped_${decode_set}
+
+    # Lattice rescoring
+    rnnlm/lmrescore_nbest.sh \
+      --cmd "$decode_cmd --mem 4G" --N 20 \
+      0.8 data/lang $dir \
+      data/${decode_set}_hires ${decode_dir} \
+      exp/decode_${decode_set}_${decode_dir_suffix}_nbest
+  done
+fi
+
+exit 0
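Since the script now sources utils/parse_options.sh, every variable in the configuration section can be overridden from the command line (parse_options maps --foo-bar to $foo_bar). A hypothetical invocation that skips training and redoes only the rescoring stages against an existing acoustic-model directory (the directory name here is illustrative, not from the patch):

    local/rnnlm/tuning/run_lstm_tdnn.sh --stage 4 \
        --ac-model-dir exp/chain_cleaned/tdnn_lstm1i_sp_bi \
        --run-nbest-rescore false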
+ echo "" >$dir/config/oov.txt + + cat > $dir/config/data_weights.txt <$dir/config/unigram_probs.txt + + # choose features + rnnlm/choose_features.py --unigram-probs=$dir/config/unigram_probs.txt \ + --use-constant-feature=true \ + --top-word-features=10000 \ + --min-frequency 1.0e-03 \ + --special-words=',,,' \ + $dir/config/words.txt > $dir/config/features.txt +fi + +cat >$dir/config/xconfig < Date: Thu, 20 Sep 2018 14:17:28 -0400 Subject: [PATCH 3/3] update tedlium rnnlm results --- .../s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh | 20 +++--- .../rnnlm/tuning/run_lstm_tdnn_with_lm1b.sh | 66 +++++++++++++------ 2 files changed, 56 insertions(+), 30 deletions(-) diff --git a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh index 9887e5ee150..87f99f651bf 100755 --- a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh +++ b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh @@ -16,10 +16,10 @@ # %WER 7.8 | 1155 27500 | 93.1 4.5 2.4 0.9 7.8 66.4 | -0.089 | /export/a12/ywang/kaldi/egs/tedlium/s5_r2/exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi/decode_looped_test_rescore/score_10_0.0/ctm.filt.filt.sys # RNNLM lattice rescoring -# %WER 7.4 | 1155 27500 | 93.4 4.2 2.4 0.8 7.4 65.0 | -0.117 | exp/decode_test_rnnlm_tedlium/score_10_0.0/ctm.filt.filt.sys +# %WER 7.2 | 1155 27500 | 93.6 4.0 2.3 0.8 7.2 64.3 | -0.927 | exp/decode_looped_test_rnnlm_tedlium_rescore//score_10_0.0/ctm.filt.filt.sys # RNNLM nbest rescoring -# %WER 7.7 | 1155 27500 | 93.1 4.3 2.6 0.8 7.7 65.8 | -0.855 | exp/decode_test_rnnlm_tedlium_nbest//score_10_0.0/ctm.filt.filt.sys +# %WER 7.4 | 1155 27500 | 93.4 4.3 2.3 0.9 7.4 64.8 | -0.863 | exp/decode_looped_test_rnnlm_tedlium_nbest_rescore/score_8_0.0/ctm.filt.filt.sys # Begin configuration section. 
From: Ke Li
Date: Thu, 20 Sep 2018 14:17:28 -0400
Subject: [PATCH 3/3] update tedlium rnnlm results

---
 .../s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh | 20 +++---
 .../rnnlm/tuning/run_lstm_tdnn_with_lm1b.sh   | 66 +++++++++++++------
 2 files changed, 56 insertions(+), 30 deletions(-)

diff --git a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
index 9887e5ee150..87f99f651bf 100755
--- a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
+++ b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn.sh
@@ -16,10 +16,10 @@
 # %WER 7.8 | 1155 27500 | 93.1 4.5 2.4 0.9 7.8 66.4 | -0.089 | /export/a12/ywang/kaldi/egs/tedlium/s5_r2/exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi/decode_looped_test_rescore/score_10_0.0/ctm.filt.filt.sys
 
 # RNNLM lattice rescoring
-# %WER 7.4 | 1155 27500 | 93.4 4.2 2.4 0.8 7.4 65.0 | -0.117 | exp/decode_test_rnnlm_tedlium/score_10_0.0/ctm.filt.filt.sys
+# %WER 7.2 | 1155 27500 | 93.6 4.0 2.3 0.8 7.2 64.3 | -0.927 | exp/decode_looped_test_rnnlm_tedlium_rescore/score_10_0.0/ctm.filt.filt.sys
 
 # RNNLM nbest rescoring
-# %WER 7.7 | 1155 27500 | 93.1 4.3 2.6 0.8 7.7 65.8 | -0.855 | exp/decode_test_rnnlm_tedlium_nbest/score_10_0.0/ctm.filt.filt.sys
+# %WER 7.4 | 1155 27500 | 93.4 4.3 2.3 0.9 7.4 64.8 | -0.863 | exp/decode_looped_test_rnnlm_tedlium_nbest_rescore/score_8_0.0/ctm.filt.filt.sys
 
 # Begin configuration section.
 cmd=run.pl
@@ -36,7 +36,7 @@ epochs=20
 run_lat_rescore=true
 run_nbest_rescore=true
 decode_dir_suffix=rnnlm_tedlium
-ac_model_dir=/export/a12/ywang/kaldi/egs/tedlium/s5_r2/exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi
+ac_model_dir=exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi
 ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
               # if it's set, it merges histories in the lattice if they share
               # the same ngram history and this prevents the lattice from
               # exploding exponentially
@@ -64,7 +64,6 @@ if [ $stage -le 0 ]; then
   cat $text | cut -d ' ' -f2- | tail -n +$[$dev_sents+1] > $text_dir/ted.txt
 fi
-
 if [ $stage -le 1 ]; then
   cp $wordlist $dir/config/
   n=`cat $dir/config/words.txt | wc -l`
@@ -113,7 +112,8 @@ fi
 if [ $stage -le 3 ]; then
   rnnlm/train_rnnlm.sh --num-jobs-initial 1 --num-jobs-final 1 \
-                  --stage $train_stage --num-epochs $epochs --cmd "queue.pl" $dir
+                  --stage $train_stage --num-epochs $epochs \
+                  --cmd "queue.pl" $dir
 fi
 
 if [ $stage -le 4 ] && $run_lat_rescore; then
@@ -123,7 +123,7 @@ if [ $stage -le 4 ] && $run_lat_rescore; then
     pruned=_pruned
   fi
   for decode_set in dev test; do
-    decode_dir=${ac_model_dir}/decode_looped_${decode_set}
+    decode_dir=${ac_model_dir}/decode_looped_${decode_set}_rescore
 
     # Lattice rescoring
     rnnlm/lmrescore$pruned.sh \
       --cmd "$decode_cmd --mem 4G" \
       --weight 0.5 --max-ngram-order $ngram_order \
       data/lang $dir \
       data/${decode_set}_hires ${decode_dir} \
-      exp/decode_${decode_set}_${decode_dir_suffix}
+      exp/decode_looped_${decode_set}_${decode_dir_suffix}_rescore
   done
 fi
 
 if [ $stage -le 5 ] && $run_nbest_rescore; then
   echo "$0: Perform nbest-rescoring on $ac_model_dir"
   for decode_set in dev test; do
-    decode_dir=${ac_model_dir}/decode_looped_${decode_set}
+    decode_dir=${ac_model_dir}/decode_looped_${decode_set}_rescore
 
-    # Lattice rescoring
+    # nbest rescoring
     rnnlm/lmrescore_nbest.sh \
       --cmd "$decode_cmd --mem 4G" --N 20 \
       0.8 data/lang $dir \
       data/${decode_set}_hires ${decode_dir} \
-      exp/decode_${decode_set}_${decode_dir_suffix}_nbest
+      exp/decode_looped_${decode_set}_${decode_dir_suffix}_nbest_rescore
   done
 fi
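With the decode directories renamed as above, the rescored test-set results land under exp/decode_looped_test_rnnlm_tedlium_rescore and exp/decode_looped_test_rnnlm_tedlium_nbest_rescore. One way to eyeball the scores across the LM-scale/penalty settings is to grep the sclite summary rows (a sketch; the %WER header lines in this patch were taken from exactly such scoring directories):

    grep Sum exp/decode_looped_test_rnnlm_tedlium_rescore/score_*/ctm.filt.filt.sys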
diff --git a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn_with_lm1b.sh b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn_with_lm1b.sh
index 56ab10f8425..a0b16dea890 100755
--- a/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn_with_lm1b.sh
+++ b/egs/tedlium/s5_r2/local/rnnlm/tuning/run_lstm_tdnn_with_lm1b.sh
@@ -23,19 +23,19 @@
 # Begin configuration section.
 cmd=run.pl
 decode_cmd=run.pl
-dir=exp/rnnlm_lm1b_tedlium_weight3
+dir=exp/rnnlm_lstm_tdnn_with_lm1b
 embedding_dim=1024
 lstm_rpd=256
 lstm_nrpd=256
 stage=0
 train_stage=-10
-epochs=20
+epochs=3
 
 # variables for lattice rescoring
 run_lat_rescore=true
 run_nbest_rescore=true
-decode_dir_suffix=rnnlm_lm1b_tedlium_weight3
+decode_dir_suffix=rnnlm_lstm_tdnn_with_lm1b
-ac_model_dir=/export/a12/ywang/kaldi/egs/tedlium/s5_r2/exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi
+ac_model_dir=exp/chain_cleaned/tdnn_lstm1i_adversarial1.0_interval4_epoches7_lin_to_5_sp_bi
 ngram_order=4 # approximate the lattice-rescoring by limiting the max-ngram-order
               # if it's set, it merges histories in the lattice if they share
               # the same ngram history and this prevents the lattice from
               # exploding exponentially
@@ -45,7 +45,7 @@ pruned_rescore=true
 . ./cmd.sh
 . ./utils/parse_options.sh
 
-text=/export/a07/keli1/1-billion-word-language-modeling-benchmark/training-monolingual.tokenized.shuffled/news.en
+lm1b_dir=data/rnnlm/lm1b
 wordlist=data/lang/words.txt
 train_text=data/train/text
 dev_sents=10000
@@ -53,21 +53,46 @@ text_dir=data/rnnlm/text_lm1b_tedlium
 mkdir -p $dir/config
 set -e
 
-for f in $text $wordlist $train_text; do
+for f in $wordlist $train_text; do
   [ ! -f $f ] && \
     echo "$0: expected file $f to exist; generate lm1b data first; \
         search for local/prepare_data.sh and utils/prepare_lang.sh in run.sh" && exit 1
 done
 
 if [ $stage -le 0 ]; then
+  mkdir -p $lm1b_dir
+  cd $lm1b_dir
+  if [ ! -f training-monolingual.tgz ]; then
+    wget http://statmt.org/wmt11/training-monolingual.tgz
+  fi
+  echo "Downloaded Google one-billion-word dataset."
+
+  if [ ! -d training-monolingual ]; then
+    tar --extract -v --file training-monolingual.tgz --wildcards training-monolingual/news.20??.en.shuffled
+  fi
+  echo "Untarred Google one-billion-word dataset."
+
+  for year in 2007 2008 2009 2010 2011; do
+    cat training-monolingual/news.${year}.en.shuffled
+  done | sort -u --output=training-monolingual/news.20XX.en.shuffled.sorted
+  echo "Done sorting corpus."
+
+  time cat training-monolingual/news.20XX.en.shuffled.sorted | \
+    ../../../utils/normalize_punctuation.pl -l en -q 1 | \
+    ../../../utils/tokenizer.pl -l en -q 1 > \
+    training-monolingual/news.20XX.en.shuffled.sorted.tokenized
+  echo "Done tokenizing corpus."
+  cd ../../..
+fi
+
+if [ $stage -le 1 ]; then
   mkdir -p $text_dir
   cat $train_text | cut -d ' ' -f2- | head -n $dev_sents > $text_dir/dev.txt
   cat $train_text | cut -d ' ' -f2- | tail -n +$[$dev_sents+1] > $text_dir/ted.txt
-  cp $text $text_dir/lm1b.txt
+  cp $lm1b_dir/training-monolingual/news.20XX.en.shuffled.sorted.tokenized $text_dir/lm1b.txt
 fi
 
-if [ $stage -le 1 ]; then
+if [ $stage -le 2 ]; then
   cp $wordlist $dir/config/
   n=`cat $dir/config/words.txt | wc -l`
   echo "<brk> $n" >> $dir/config/words.txt
@@ -95,7 +120,7 @@ EOF
                            $dir/config/words.txt > $dir/config/features.txt
 fi
 
-cat >$dir/config/xconfig <<EOF
+cat >$dir/config/xconfig <<EOF