diff --git a/egs/aishell/s5/local/aishell_train_lms.sh b/egs/aishell/s5/local/aishell_train_lms.sh index ea72614689d..9b6cdad2960 100755 --- a/egs/aishell/s5/local/aishell_train_lms.sh +++ b/egs/aishell/s5/local/aishell_train_lms.sh @@ -23,7 +23,7 @@ kaldi_lm=`which train_lm.sh` if [ -z $kaldi_lm ]; then echo "$0: train_lm.sh is not found. That might mean it's not installed" echo "$0: or it is not added to PATH" - echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it" exit 1 fi diff --git a/egs/aishell2/s5/local/train_lms.sh b/egs/aishell2/s5/local/train_lms.sh index fbe95d898a1..179a7b78e14 100755 --- a/egs/aishell2/s5/local/train_lms.sh +++ b/egs/aishell2/s5/local/train_lms.sh @@ -24,7 +24,7 @@ kaldi_lm=`which train_lm.sh` if [ -z $kaldi_lm ]; then echo "$0: train_lm.sh is not found. That might mean it's not installed" echo "$0: or it is not added to PATH" - echo "$0: Use the script tools/extra/install_kaldi_lm.sh to install it" + echo "$0: Use the script tools/extras/install_kaldi_lm.sh to install it" exit 1 fi diff --git a/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl b/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl new file mode 100755 index 00000000000..71b26b55de5 --- /dev/null +++ b/egs/callhome_diarization/v1/local/make_swbd2_phase1.pl @@ -0,0 +1,106 @@ +#!/usr/bin/perl +use warnings; #sed replacement for -w perl parameter +# +# Copyright 2017 David Snyder +# Apache 2.0 + +if (@ARGV != 2) { + print STDERR "Usage: $0 <path-to-LDC98S75> <path-to-output>\n"; + print STDERR "e.g. $0 /export/corpora3/LDC/LDC98S75 data/swbd2_phase1_train\n"; + exit(1); +} +($db_base, $out_dir) = @ARGV; + +if (system("mkdir -p $out_dir")) { + die "Error making directory $out_dir"; +} + +open(CS, "<$db_base/doc/callstat.tbl") || die "Could not open $db_base/doc/callstat.tbl"; +open(GNDR, ">$out_dir/spk2gender") || die "Could not open the output file $out_dir/spk2gender"; +open(SPKR, ">$out_dir/utt2spk") || die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">$out_dir/wav.scp") || die "Could not open the output file $out_dir/wav.scp"; + +@badAudio = ("3", "4"); + +$tmp_dir = "$out_dir/tmp"; +if (system("mkdir -p $tmp_dir") != 0) { + die "Error making directory $tmp_dir"; +} + +if (system("find $db_base -name '*.sph' > $tmp_dir/sph.list") != 0) { + die "Error getting list of sph files"; +} + +open(WAVLIST, "<$tmp_dir/sph.list") or die "cannot open wav list"; + +%wavs = (); +while(<WAVLIST>) { + chomp; + $sph = $_; + @t = split("/",$sph); + @t1 = split("[./]",$t[$#t]); + $uttId = $t1[0]; + $wavs{$uttId} = $sph; +} + +while (<CS>) { + $line = $_ ; + @A = split(",", $line); + @A1 = split("[./]",$A[0]); + $wav = $A1[0]; + if (/$wav/i ~~ @badAudio) { + # do nothing + print "Bad Audio = $wav"; + } else { + $spkr1= "sw_" . $A[2]; + $spkr2= "sw_" . $A[3]; + $gender1 = $A[5]; + $gender2 = $A[6]; + if ($gender1 eq "M") { + $gender1 = "m"; + } elsif ($gender1 eq "F") { + $gender1 = "f"; + } else { + die "Unknown Gender in $line"; + } + if ($gender2 eq "M") { + $gender2 = "m"; + } elsif ($gender2 eq "F") { + $gender2 = "f"; + } else { + die "Unknown Gender in $line"; + } + if (-e "$wavs{$wav}") { + $uttId = $spkr1 ."_" . $wav ."_1"; + if (!$spk2gender{$spkr1}) { + $spk2gender{$spkr1} = $gender1; + print GNDR "$spkr1"," $gender1\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 1 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr1","\n"; + + $uttId = $spkr2 . "_" . 
$wav ."_2"; + if (!$spk2gender{$spkr2}) { + $spk2gender{$spkr2} = $gender2; + print GNDR "$spkr2"," $gender2\n"; + } + print WAV "$uttId"," sph2pipe -f wav -p -c 2 $wavs{$wav} |\n"; + print SPKR "$uttId"," $spkr2","\n"; + } else { + print STDERR "Missing $wavs{$wav} for $wav\n"; + } + } +} + +close(WAV) || die; +close(SPKR) || die; +close(GNDR) || die; +if (system("utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +if (system("utils/fix_data_dir.sh $out_dir") != 0) { + die "Error fixing data dir $out_dir"; +} +if (system("utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/heroico/s5/RESULTS b/egs/heroico/s5/RESULTS index 9717e95e6e2..7942c03b1d9 100644 --- a/egs/heroico/s5/RESULTS +++ b/egs/heroico/s5/RESULTS @@ -1,22 +1,48 @@ # for dir in $(echo exp/tri*/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done -%WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 -%WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 -%WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 -%WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] exp/tri1/decode_test/wer_13_1.0 -%WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 -%WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 -%WER 67.51 [ 6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 -%WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 -%WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 -%WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 -%WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 -%WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 -%WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 -%WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 -%WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 -%WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 -%WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 -%WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 -%WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 -%WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 +# old results before adding Movie subtitles text corpus in LM training: +# %WER 67.01 [ 5126 / 7650, 837 ins, 575 del, 3714 sub ] exp/tri1/decode_devtest/wer_14_1.0 +# %WER 62.39 [ 4678 / 7498, 768 ins, 397 del, 3513 sub ] exp/tri1/decode_native/wer_13_1.0 +# %WER 67.05 [ 6179 / 9215, 895 ins, 606 del, 4678 sub ] exp/tri1/decode_nonnative/wer_13_1.0 +# %WER 64.97 [ 10859 / 16713, 1678 ins, 999 del, 8182 sub ] exp/tri1/decode_test/wer_13_1.0 +# %WER 65.90 [ 5041 / 7650, 1016 ins, 416 del, 3609 sub ] exp/tri2b/decode_devtest/wer_12_1.0 +# %WER 61.26 [ 4593 / 7498, 908 ins, 300 del, 3385 sub ] exp/tri2b/decode_native/wer_14_1.0 +# %WER 67.51 [ 
6221 / 9215, 1085 ins, 524 del, 4612 sub ] exp/tri2b/decode_nonnative/wer_14_1.0 +# %WER 64.87 [ 10842 / 16713, 2004 ins, 838 del, 8000 sub ] exp/tri2b/decode_test/wer_14_1.0 +# %WER 66.09 [ 5056 / 7650, 1078 ins, 402 del, 3576 sub ] exp/tri3b/decode_devtest/wer_16_1.0 +# %WER 74.88 [ 5728 / 7650, 1210 ins, 426 del, 4092 sub ] exp/tri3b/decode_devtest.si/wer_15_1.0 +# %WER 61.19 [ 4588 / 7498, 1038 ins, 255 del, 3295 sub ] exp/tri3b/decode_native/wer_14_1.0 +# %WER 70.99 [ 5323 / 7498, 1185 ins, 301 del, 3837 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +# %WER 66.35 [ 6114 / 9215, 1186 ins, 421 del, 4507 sub ] exp/tri3b/decode_nonnative/wer_17_1.0 +# %WER 76.36 [ 7037 / 9215, 1420 ins, 467 del, 5150 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +# %WER 64.06 [ 10706 / 16713, 2245 ins, 657 del, 7804 sub ] exp/tri3b/decode_test/wer_15_1.0 +# %WER 73.97 [ 12362 / 16713, 2608 ins, 766 del, 8988 sub ] exp/tri3b/decode_test.si/wer_16_1.0 +# %WER 53.07 [ 4060 / 7650, 744 ins, 376 del, 2940 sub ] exp/chain/tdnn1e_sp/decode_devtest/wer_7_1.0 +# %WER 54.47 [ 4084 / 7498, 536 ins, 475 del, 3073 sub ] exp/chain/tdnn1e_sp/decode_native/wer_7_1.0 +# %WER 63.01 [ 5806 / 9215, 685 ins, 784 del, 4337 sub ] exp/chain/tdnn1e_sp/decode_nonnative/wer_7_1.0 +# %WER 59.25 [ 9903 / 16713, 1226 ins, 1259 del, 7418 sub ] exp/chain/tdnn1e_sp/decode_test/wer_7_1.0 + +# new results: +%WER 18.27 [ 1398 / 7650, 213 ins, 253 del, 932 sub ] exp/tri1/decode_devtest/wer_15_0.5 +%WER 9.95 [ 746 / 7498, 74 ins, 108 del, 564 sub ] exp/tri1/decode_native/wer_13_0.5 +%WER 16.63 [ 1532 / 9215, 197 ins, 183 del, 1152 sub ] exp/tri1/decode_nonnative/wer_17_0.0 +%WER 13.68 [ 2287 / 16713, 207 ins, 360 del, 1720 sub ] exp/tri1/decode_test/wer_17_0.5 +%WER 17.19 [ 1315 / 7650, 227 ins, 231 del, 857 sub ] exp/tri2b/decode_devtest/wer_17_0.5 +%WER 9.23 [ 692 / 7498, 60 ins, 103 del, 529 sub ] exp/tri2b/decode_native/wer_16_0.5 +%WER 17.16 [ 1581 / 9215, 184 ins, 216 del, 1181 sub ] exp/tri2b/decode_nonnative/wer_17_0.5 +%WER 13.64 [ 2279 / 16713, 241 ins, 326 del, 1712 sub ] exp/tri2b/decode_test/wer_17_0.5 +%WER 15.36 [ 1175 / 7650, 212 ins, 210 del, 753 sub ] exp/tri3b/decode_devtest/wer_17_0.5 +%WER 20.27 [ 1551 / 7650, 269 ins, 257 del, 1025 sub ] exp/tri3b/decode_devtest.si/wer_14_1.0 +%WER 6.40 [ 480 / 7498, 50 ins, 58 del, 372 sub ] exp/tri3b/decode_native/wer_16_0.0 +%WER 10.91 [ 818 / 7498, 100 ins, 112 del, 606 sub ] exp/tri3b/decode_native.si/wer_16_1.0 +%WER 14.30 [ 1318 / 9215, 206 ins, 134 del, 978 sub ] exp/tri3b/decode_nonnative/wer_17_0.0 +%WER 21.62 [ 1992 / 9215, 286 ins, 224 del, 1482 sub ] exp/tri3b/decode_nonnative.si/wer_16_1.0 +%WER 10.78 [ 1802 / 16713, 247 ins, 195 del, 1360 sub ] exp/tri3b/decode_test/wer_17_0.0 +%WER 16.81 [ 2809 / 16713, 374 ins, 338 del, 2097 sub ] exp/tri3b/decode_test.si/wer_16_1.0 + +# chain model results: +# for dir in $(echo exp/chain/tdnn1b_sp/decode* | grep -v 'si/'); do grep WER $dir/wer* | utils/best_wer.sh; done +%WER 12.99 [ 994 / 7650, 192 ins, 163 del, 639 sub ] exp/chain/tdnn1b_sp/decode_devtest/wer_10_1.0 +%WER 12.47 [ 1149 / 9215, 119 ins, 174 del, 856 sub ] exp/chain/tdnn1b_sp/decode_nonnative/wer_12_0.0 +%WER 9.64 [ 1611 / 16713, 169 ins, 240 del, 1202 sub ] exp/chain/tdnn1b_sp/decode_test/wer_12_0.0 +%WER 6.13 [ 460 / 7498, 52 ins, 55 del, 353 sub ] exp/chain/tdnn1b_sp/decode_native/wer_10_0.0 diff --git a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh similarity index 91% rename from 
egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh rename to egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh index ef4824bf7f2..1112f0ec08b 100755 --- a/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a10.sh +++ b/egs/heroico/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -1,13 +1,11 @@ #!/bin/bash - -# run_cnn_tdnn_1a10.sh is modified from run_tdnn_1b.sh but taking +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1b.sh but taking # the xconfig from mini-librispeech's run_cnn_tdnn_1a54.sh; only # reducing the bottleneck-dim from 96 to 64, which is the value -# the run_tdnn1b.sh script here has. -# Better! -# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp exp/chain/cnn_tdnn1a10_sp -# System tdnn1a_sp tdnn1b_sp cnn_tdnn1a10_sp +# the run_tdnn1b.sh script here has. Results are better. +# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1a_sp tdnn1b_sp cnn_tdnn1a_sp # %WER devtest 53.07 52.54 51.10 # %WER test 59.25 53.70 52.07 # %WER native 54.47 48.76 47.88 @@ -18,27 +16,6 @@ # Final valid prob (xent) -1.0719 -1.0849 -0.9915 # Num-params 6567648 3321312 3345088 - - -# 1b is as 1a but a re-tuned model with quite a few changes, including moving to -# a resnet-style factored TDNN-F model. -# -# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp -# System tdnn1a_sp tdnn1b_sp -# %WER devtest 53.07 52.54 -# %WER test 59.25 53.70 -# %WER native 54.47 48.76 -# %WER nonnative 63.01 57.66 -# Final train prob -0.0253 -0.0547 -# Final valid prob -0.0687 -0.0694 -# Final train prob (xent) -0.7715 -0.9502 -# Final valid prob (xent) -1.0719 -1.0849 -# Num-params 6567648 3321312 - - -# steps/info/chain_dir_info.pl exp/chain/tdnn1b_sp -# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1392 combine=-0.059->-0.059 (over 1) xent:train/valid[21,33,final]=(-1.28,-0.986,-0.950/-1.38,-1.10,-1.08) logprob:train/valid[21,33,final]=(-0.085,-0.063,-0.055/-0.090,-0.074,-0.069) - # Set -e here so that we catch if any executable fails immediately set -euo pipefail @@ -53,7 +30,7 @@ nnet3_affix= # The rest are configs specific to this script. Most of the parameters # are just hardcoded at this level, in the commands below. 
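# (Aside, not part of this patch: like the other tuning scripts in this recipe,
# this one is assumed to source ./utils/parse_options.sh after its variable
# definitions, so the "hardcoded" defaults below can still be overridden on the
# command line, e.g.
#   local/chain/tuning/run_cnn_tdnn_1a.sh --affix 1a --train-stage -10
# The available options are exactly the variables declared before that call.)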
-affix=1a10 # affix for the TDNN directory name +affix=1a # affix for the TDNN directory name tree_affix= train_stage=-10 get_egs_stage=-10 diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh index 4658f4d3d6d..6dde42bef79 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1a.sh @@ -1,19 +1,20 @@ #!/bin/bash # local/chain/compare_wer.sh exp/chain/tdnn1a_sp +# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp # System tdnn1a_sp -# %WER devtest 53.07 -# %WER test 59.25 -# %WER native 54.47 -# %WER nonnative 63.01 -# Final train prob -0.0253 -# Final valid prob -0.0687 -# Final train prob (xent) -0.7715 -# Final valid prob (xent) -1.0719 -# Num-params 6567648 +# %WER devtest 13.10 +# %WER test 15.53 +# %WER native 10.14 +# %WER nonnative 19.78 +# Final train prob -0.0233 +# Final valid prob -0.0720 +# Final train prob (xent) -0.8107 +# Final valid prob (xent) -0.9898 +# Num-params 6559440 # steps/info/chain_dir_info.pl exp/chain/tdnn1a_sp/ -#exp/chain/tdnn1a_sp/: num-iters=105 nj=1..1 num-params=6.6M dim=40+100->1392 combine=-0.040->-0.033 (over 7) xent:train/valid[69,104,final]=(-1.12,-0.880,-0.771/-1.33,-1.21,-1.07) logprob:train/valid[69,104,final]=(-0.050,-0.031,-0.025/-0.079,-0.080,-0.069) +# exp/chain/tdnn1a_sp: num-iters=105 nj=1..1 num-params=6.6M dim=40+100->1384 combine=-0.032->-0.026 (over 7) xent:train/valid[69,104,final]=(-1.14,-0.892,-0.811/-1.19,-1.07,-0.990) logprob:train/valid[69,104,final]=(-0.045,-0.029,-0.023/-0.083,-0.080,-0.072) # Set -e here so that we catch if any executable fails immediately set -euo pipefail diff --git a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh index 33ce1556d29..d255d85327f 100755 --- a/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh +++ b/egs/heroico/s5/local/chain/tuning/run_tdnn_1b.sh @@ -3,21 +3,20 @@ # 1b is as 1a but a re-tuned model with quite a few changes, including moving to # a resnet-style factored TDNN-F model. 
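# (Orientation, not part of this patch: in nnet3 xconfig notation a factored
# TDNN-F layer is written roughly as
#   tdnnf-layer name=tdnnf1 dim=768 bottleneck-dim=96 time-stride=1
# i.e. the weight matrix is factored through a low-rank bottleneck and the
# layer carries a resnet-style bypass connection; the dims shown here are
# placeholders, the real values for 1b are the ones set inside the script below.)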
# -# local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp +# ./local/chain/compare_wer.sh exp/chain/tdnn1a_sp exp/chain/tdnn1b_sp # System tdnn1a_sp tdnn1b_sp -# %WER devtest 53.07 52.54 -# %WER test 59.25 53.70 -# %WER native 54.47 48.76 -# %WER nonnative 63.01 57.66 -# Final train prob -0.0253 -0.0547 -# Final valid prob -0.0687 -0.0694 -# Final train prob (xent) -0.7715 -0.9502 -# Final valid prob (xent) -1.0719 -1.0849 -# Num-params 6567648 3321312 - +# %WER devtest 13.10 12.99 +# %WER test 15.53 9.64 +# %WER native 10.14 6.13 +# %WER nonnative 19.78 12.47 +# Final train prob -0.0233 -0.0442 +# Final valid prob -0.0720 -0.0726 +# Final train prob (xent) -0.8107 -0.9759 +# Final valid prob (xent) -0.9898 -0.9964 +# Num-params 6559440 3318224 # steps/info/chain_dir_info.pl exp/chain/tdnn1b_sp -# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1392 combine=-0.059->-0.059 (over 1) xent:train/valid[21,33,final]=(-1.28,-0.986,-0.950/-1.38,-1.10,-1.08) logprob:train/valid[21,33,final]=(-0.085,-0.063,-0.055/-0.090,-0.074,-0.069) +# exp/chain/tdnn1b_sp: num-iters=34 nj=2..5 num-params=3.3M dim=40+100->1384 combine=-0.044->-0.044 (over 1) xent:train/valid[21,33,final]=(-1.30,-0.993,-0.976/-1.28,-1.01,-0.996) logprob:train/valid[21,33,final]=(-0.071,-0.050,-0.044/-0.093,-0.076,-0.073) # Set -e here so that we catch if any executable fails immediately set -euo pipefail diff --git a/egs/heroico/s5/local/heroico_answers_make_lists.pl b/egs/heroico/s5/local/heroico_answers_make_lists.pl index fb3c0ecb8d1..c1a3735b4f1 100755 --- a/egs/heroico/s5/local/heroico_answers_make_lists.pl +++ b/egs/heroico/s5/local/heroico_answers_make_lists.pl @@ -30,7 +30,7 @@ my $t = "$tmpdir/answers/text"; # initialize hash for prompts -my %p = (); +my %prompts = (); # store prompts in hash LINEA: while ( my $line = <> ) { @@ -40,9 +40,27 @@ my @dirs = split /\//, $directories; # get the speaker number my $s = $dirs[-1]; + # pad the speaker number with zeroes + my $spk = ""; + if ( $s < 10 ) { + $spk = '000' . $s; + } elsif ( $s < 100 ) { + $spk = '00' . $s; + } elsif ( $s < 1000 ) { + $spk = '0' . $s; + } + # pad the filename with zeroes + my $fn = ""; + if ( $file < 10 ) { + $fn = '000' . $file; + } elsif ( $file < 100 ) { + $fn = '00' . $file; + } elsif ( $file < 1000 ) { + $fn = '0' . $file; + } # the utterance name - my $i = $s . '_' . 'a' . '_' . $file; - $p{$i} = $sent; + my $utt = $spk . '_' . $fn; + $prompts{$utt} = $sent; } open my $W, '<', $w or croak "problem with $w $!"; @@ -58,18 +76,36 @@ my @dirs = split /\//, $directories; my $r = basename $line, ".wav"; my $s = $dirs[-1]; - my $rid = $s . '_' . 'a' . '_' . $r; - if ( exists $p{$rid} ) { - print $T "$rid $p{$rid}\n"; - } elsif ( defined $rid ) { - warn "warning: problem\t$rid"; + my $spk = ""; + # pad with zeroes + if ( $s < 10 ) { + $spk = '000' . $s; + } elsif ( $s < 100 ) { + $spk = '00' . $s; + } elsif ( $s < 1000 ) { + $spk = '0' . $s; + } + # pad the file name with zeroes + my $rec = ""; + if ( $r < 10 ) { + $rec = '000' . $r; + } elsif ( $r < 100 ) { + $rec = '00' . $r; + } elsif ( $r < 1000 ) { + $rec = '0' . $r; + } + my $rec_id = $spk . '_' . 
$rec; + if ( exists $prompts{$rec_id} ) { + print $T "$rec_id $prompts{$rec_id}\n"; + } elsif ( defined $rec_id ) { + warn "warning: problem\t$rec_id"; next LINE; } else { croak "$line"; } - print $O "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $U "$rid ${s}_a\n"; + print $O "$rec_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $U "$rec_id $spk\n"; } close $T; close $O; diff --git a/egs/heroico/s5/local/heroico_download.sh b/egs/heroico/s5/local/heroico_download.sh new file mode 100755 index 00000000000..9c58fe37537 --- /dev/null +++ b/egs/heroico/s5/local/heroico_download.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Copyright 2018 John Morgan +# Apache 2.0. + +speech=$1 +lexicon=$2 + +download_dir=$(pwd) +tmpdir=data/local/tmp +data_dir=$tmpdir/LDC2006S37/data + +mkdir -p $tmpdir + +# download the corpus from openslr + +if [ ! -f $download_dir/heroico.tar.gz ]; then + wget -O $download_dir/heroico.tar.gz $speech + + ( + cd $download_dir + tar -xzf heroico.tar.gz + ) +fi + +mkdir -p data/local/dict $tmpdir/dict + +# download the dictionary from openslr + +if [ ! -f $download_dir/santiago.tar.gz ]; then + wget -O $download_dir/santiago.tar.gz $lexicon +fi + +( + cd $download_dir + tar -xzf santiago.tar.gz +) diff --git a/egs/heroico/s5/local/heroico_recordings_make_lists.pl b/egs/heroico/s5/local/heroico_recordings_make_lists.pl index 1d157665799..b9a3ab5a565 100755 --- a/egs/heroico/s5/local/heroico_recordings_make_lists.pl +++ b/egs/heroico/s5/local/heroico_recordings_make_lists.pl @@ -19,75 +19,102 @@ system "mkdir -p $tmpdir/recordings/devtest"; # input wav file list -my $w = "$tmpdir/wav_list.txt"; +my $input_wav_list = "$tmpdir/wav_list.txt"; # output temporary wav.scp files -my $o_train = "$tmpdir/recordings/train/wav.scp"; -my $o_test = "$tmpdir/recordings/devtest/wav.scp"; +my $train_wav_scp = "$tmpdir/recordings/train/wav.scp"; +my $test_wav_scp = "$tmpdir/recordings/devtest/wav.scp"; # output temporary utt2spk files -my $u_train = "$tmpdir/recordings/train/utt2spk"; -my $u_test = "$tmpdir/recordings/devtest/utt2spk"; +my $train_uttspk = "$tmpdir/recordings/train/utt2spk"; +my $test_uttspk = "$tmpdir/recordings/devtest/utt2spk"; # output temporary text files -my $t_train = "$tmpdir/recordings/train/text"; -my $t_test = "$tmpdir/recordings/devtest/text"; +my $train_text = "$tmpdir/recordings/train/text"; +my $test_text = "$tmpdir/recordings/devtest/text"; # initialize hash for prompts -my %p = (); +my %prompts = (); # store prompts in hash LINEA: while ( my $line = <> ) { chomp $line; - my ($s,$sent) = split /\t/, $line, 2; - $p{$s} = $sent; + my ($prompt_id,$prompt) = split /\t/, $line, 2; + # pad the prompt id with zeroes + my $pid = ""; + if ( $prompt_id < 10 ) { + $pid = '0000' . $prompt_id; + } elsif ( $prompt_id < 100 ) { + $pid = '000' . $prompt_id; + } elsif ( $prompt_id < 1000 ) { + $pid = '00' . 
$prompt_id; + } + $prompts{$pid} = $prompt; } -open my $W, '<', $w or croak "problem with $w $!"; -open my $OT, '+>', $o_train or croak "problem with $o_train $!"; -open my $OE, '+>', $o_test or croak "problem with $o_test $!"; -open my $UT, '+>', $u_train or croak "problem with $u_train $!"; -open my $UE, '+>', $u_test or croak "problem with $u_test $!"; -open my $TT, '+>', $t_train or croak "problem with $t_train $!"; -open my $TE, '+>', $t_test or croak "problem with $t_test $!"; +open my $WVL, '<', $input_wav_list or croak "problem with $input_wav_list $!"; +open my $TRNWSCP, '+>', $train_wav_scp or croak "problem with $train_wav_scp $!"; +open my $TSTWSCP, '+>', $test_wav_scp or croak "problem with $test_wav_scp $!"; +open my $TRNUTTSPK, '+>', $train_uttspk or croak "problem with $train_uttspk $!"; +open my $TSTUTTSPK, '+>', $test_uttspk or croak "problem with $test_uttspk $!"; +open my $TRNTXT, '+>', $train_text or croak "problem with $train_text $!"; +open my $TSTTXT, '+>', $test_text or croak "problem with $test_text $!"; - LINE: while ( my $line = <$W> ) { + LINE: while ( my $line = <$WVL> ) { chomp $line; next LINE if ($line =~ /Answers/ ); next LINE unless ( $line =~ /Recordings/ ); my ($volume,$directories,$file) = File::Spec->splitpath( $line ); my @dirs = split /\//, $directories; - my $r = basename $line, ".wav"; - my $s = $dirs[-1]; - my $rid = $s . '_r' . '_' . $r; - if ( ( $r >= 355 ) and ( $r < 561 ) ) { - if ( exists $p{$r} ) { - print $TE "$rid $p{$r}\n"; - } elsif ( defined $rid ) { - warn "problem\t$rid"; + my $utt_id = basename $line, ".wav"; + # pad the utterance id with zeroes + my $utt = ""; + if ( $utt_id < 10 ) { + $utt = '0000' . $utt_id; +} elsif ( $utt_id < 100 ) { + $utt = '000' . $utt_id; +} elsif ( $utt_id < 1000 ) { + $utt = '00' . $utt_id; +} + my $spk_id = $dirs[-1]; + # pad the speaker id with zeroes + my $spk = ""; + if ( $spk_id < 10 ) { + $spk = '000' . $spk_id; + } elsif ( $spk_id < 100 ) { + $spk = '00' . $spk_id; + } elsif ( $spk_id < 1000 ) { + $spk = '0' . $spk_id; + } + my $spk_utt_id = $spk . '_' . 
$utt; + if ( ( $utt_id >= 355 ) and ( $utt_id < 561 ) ) { +if ( exists $prompts{$utt} ) { + print $TSTTXT "$spk_utt_id $prompts{$utt}\n"; + } elsif ( defined $spk_utt_id ) { + warn "problem\t$spk_utt_id"; next LINE; } else { croak "$line"; } - print $OE "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $UE "$rid ${s}_r\n"; - } elsif ( ( $r < 355 ) or ( $r > 560 ) ) { - if ( exists $p{$r} ) { - print $TT "$rid $p{$r}\n"; - } elsif ( defined $rid ) { - warn "problem\t$rid"; + print $TSTWSCP "$spk_utt_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $TSTUTTSPK "$spk_utt_id $spk\n"; + } elsif ( ( $utt_id < 355 ) or ( $utt_id > 560 ) ) { + if ( exists $prompts{$utt} ) { + print $TRNTXT "$spk_utt_id $prompts{$utt}\n"; + } elsif ( defined $spk_utt_id ) { + warn "problem\t$spk_utt_id"; next LINE; } else { croak "$line"; } - print $OT "$rid sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; - print $UT "$rid ${s}_r\n"; - } + print $TRNWSCP "$spk_utt_id sox -r 22050 -e signed -b 16 $line -r 16000 -t wav - |\n"; + print $TRNUTTSPK "$spk_utt_id $spk\n"; + } } -close $TT; -close $OT; -close $UT; -close $TE; -close $OE; -close $UE; -close $W; +close $TRNTXT; +close $TRNWSCP; +close $TRNUTTSPK; +close $TSTTXT; +close $TSTWSCP; +close $TSTUTTSPK; +close $WVL; diff --git a/egs/heroico/s5/local/nnet3/run_ivector_common.sh b/egs/heroico/s5/local/nnet3/run_ivector_common.sh index 153f0073667..e882ce0c918 100755 --- a/egs/heroico/s5/local/nnet3/run_ivector_common.sh +++ b/egs/heroico/s5/local/nnet3/run_ivector_common.sh @@ -9,6 +9,9 @@ set -euo pipefail # of usage. stage=0 +nj=56 +num_threads_ubm=2 + train_set=train test_sets="native nonnative devtest test" gmm=tri3b @@ -37,25 +40,17 @@ if [ $stage -le 1 ]; then utils/data/perturb_data_dir_speed_3way.sh \ data/${train_set} \ data/${train_set}_sp - echo "$0: making MFCC features for low-resolution speed-perturbed data" - steps/make_mfcc.sh \ - --cmd "$train_cmd" \ - --nj 10 \ - data/${train_set}_sp || exit 1; - steps/compute_cmvn_stats.sh \ - data/${train_set}_sp || exit 1; - utils/fix_data_dir.sh \ - data/${train_set}_sp + + echo "$0: making mfcc features for low-resolution speed-perturbed data" + steps/make_mfcc.sh --cmd "$train_cmd" --nj 10 data/${train_set}_sp || exit 1; + steps/compute_cmvn_stats.sh data/${train_set}_sp || exit 1; + utils/fix_data_dir.sh data/${train_set}_sp fi if [ $stage -le 2 ]; then echo "$0: aligning with the perturbed low-resolution data" steps/align_fmllr.sh \ - --nj 20 \ - --cmd "$train_cmd" \ - data/${train_set}_sp \ - data/lang \ - $gmm_dir \ + --nj 20 --cmd "$train_cmd" data/${train_set}_sp data/lang $gmm_dir \ $ali_dir || exit 1 fi diff --git a/egs/heroico/s5/local/prepare_data.sh b/egs/heroico/s5/local/prepare_data.sh index db2b990c07b..b78d9f1d1cb 100755 --- a/egs/heroico/s5/local/prepare_data.sh +++ b/egs/heroico/s5/local/prepare_data.sh @@ -4,17 +4,17 @@ # Apache 2.0. . ./cmd.sh - . ./path.sh stage=0 +datadir=$1 . 
./utils/parse_options.sh set -e set -o pipefail -# the location of the LDC corpus -datadir=$1 +tmpdir=data/local/tmp + # acoustic models are trained on the heroico corpus # testing is done on the usma corpus # heroico consists of 2 parts: answers and recordings (recited) @@ -25,8 +25,6 @@ recordings_transcripts=$datadir/data/transcripts/heroico-recordings.txt # usma is all recited usma_transcripts=$datadir/data/transcripts/usma-prompts.txt -tmpdir=data/local/tmp - # make acoustic model training lists if [ $stage -le 0 ]; then mkdir -p $tmpdir/heroico $tmpdir/usma @@ -37,12 +35,12 @@ if [ $stage -le 0 ]; then # the transcripts are converted to UTF8 export LC_ALL=en_US.UTF-8 cat $answers_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/heroico_answers_make_lists.pl + tr -d '\r' | local/heroico_answers_make_lists.pl utils/fix_data_dir.sh $tmpdir/heroico/answers cat $recordings_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/heroico_recordings_make_lists.pl + tr -d '\r' | local/heroico_recordings_make_lists.pl utils/fix_data_dir.sh $tmpdir/heroico/recordings/train utils/fix_data_dir.sh $tmpdir/heroico/recordings/devtest @@ -52,11 +50,11 @@ if [ $stage -le 0 ]; then for x in wav.scp utt2spk text; do cat $tmpdir/heroico/answers/$x $tmpdir/heroico/recordings/train/$x | \ - sed -e 's/\r//' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x + tr -d '\r' | sort -k1,1 -u >$tmpdir/heroico/lists/train/$x done for x in wav.scp utt2spk text; do - cat $tmpdir/heroico/recordings/devtest/$x | sed -e 's/\r//' | \ + cat $tmpdir/heroico/recordings/devtest/$x | tr -d '\r' | \ sort -k1,1 -u >$tmpdir/heroico/lists/devtest/$x done @@ -67,10 +65,10 @@ fi if [ $stage -le 1 ]; then # make separate lists for usma (US military academy) native and nonnative cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/usma_native_make_lists.pl + tr -d '\r' | dos2unix | local/usma_native_make_lists.pl cat $usma_transcripts | iconv -f ISO-8859-1 -t UTF-8 | \ - sed -e 's/\r//' | local/usma_nonnative_make_lists.pl + tr -d '\r' | local/usma_nonnative_make_lists.pl for n in native nonnative; do mkdir -p $tmpdir/usma/$n/lists @@ -86,14 +84,14 @@ if [ $stage -le 1 ]; then # get training lists for x in wav.scp utt2spk text; do cat $tmpdir/heroico/answers/${x} $tmpdir/heroico/recordings/train/${x} | \ - sed -e 's/\r//' >$tmpdir/lists/train/$x + tr -d '\r' >$tmpdir/lists/train/$x sort $tmpdir/lists/train/$x >data/train/$x done # get devtest lists for x in wav.scp utt2spk text; do cat $tmpdir/heroico/lists/devtest/$x | \ - sed -e 's/\r//' >$tmpdir/lists/devtest/$x + tr -d '\r' >$tmpdir/lists/devtest/$x sort $tmpdir/lists/devtest/$x >data/devtest/$x done diff --git a/egs/heroico/s5/local/prepare_dict.sh b/egs/heroico/s5/local/prepare_dict.sh index a6d182a6852..9f498bc963a 100755 --- a/egs/heroico/s5/local/prepare_dict.sh +++ b/egs/heroico/s5/local/prepare_dict.sh @@ -13,12 +13,12 @@ fi export LC_ALL=C -cut -f2- data/local/tmp/dict/santiago.txt | \ +cut -f2- ./santiago.txt | \ tr -s '[:space:]' '[\n*]' | \ grep -v SPN | sort -u >data/local/dict/nonsilence_phones.txt # sed "1d" deletes the last line. 
-expand -t 1 data/local/tmp/dict/santiago.txt | sort -u | +expand -t 1 ./santiago.txt | sort -u | sed "1d" >data/local/dict/lexicon.txt echo " SPN" >> data/local/dict/lexicon.txt diff --git a/egs/heroico/s5/local/subs_download.sh b/egs/heroico/s5/local/subs_download.sh new file mode 100755 index 00000000000..98dcb42d4e0 --- /dev/null +++ b/egs/heroico/s5/local/subs_download.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Copyright 2017 John Morgan +# Apache 2.0. + +tmpdir=data/local/tmp +download_dir=$(pwd) +mkdir -p $download_dir +subs_src=$1 + +# download the subs corpus +if [ ! -f $download_dir/subs.zip ]; then + wget -O $download_dir/subs.zip $subs_src + ( + cd $download_dir + unzip subs.zip + ) + else + echo "$0: subs file already downloaded." +fi diff --git a/egs/heroico/s5/local/subs_prepare_data.pl b/egs/heroico/s5/local/subs_prepare_data.pl index 3cd906d4699..a7e0cfb0c6e 100755 --- a/egs/heroico/s5/local/subs_prepare_data.pl +++ b/egs/heroico/s5/local/subs_prepare_data.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # Copyright 2017 John Morgan # Apache 2.0. @@ -12,69 +12,64 @@ use Encode; # set lower and upper bounds -my $lb = 8; -# only segments with at least $lb words will be written -my $ub = 16; -# only segments with fewer than $ub words will be written +my $low_bound = 8; +# only segments with at least $low_bound words will be written +my $up_bound = 16; +# only segments with fewer than $up_bound words will be written # input and output files -my $c = "data/local/tmp/subs/OpenSubtitles2016.en-es.es"; -my $symtab = "data/lang/words.txt"; -my $rl = "data/local/tmp/subs/lm/es.txt"; -my $oo = "data/local/tmp/subs/lm/oovs.txt"; + +my $corpus = "OpenSubtitles2018.en-es.es"; +my $symbol_table = "data/lang/words.txt"; +my $filtered = "data/local/tmp/subs/lm/es.txt"; +my $oovs = "data/local/tmp/subs/lm/oovs.txt"; my $iv = "data/local/tmp/subs/lm/in_vocabulary.txt"; -open my $C, '<', $c or croak "problems with $c $!"; +open my $C, '<', $corpus or croak "problems with $corpus $!"; system "mkdir -p data/local/tmp/subs/lm"; -open my $RL, '+>:utf8', $rl or croak "problems with $rl $!"; - -LINE: while ( my $line = <$C> ) { - $line = decode_utf8 $line; - chomp $line; - - my @tokens = split /\s+/, $line; - - next LINE if ( ($#tokens < $lb) or ($#tokens > $ub )); - - #remove control characters - #$line =~ s/(\p{Other})/ /g; - #$line =~ s/(\p{Control})/ /g; - #$line =~ s/(\p{Format})/ /g; - #$line =~ s/(\p{Private_Use})/ /g; - #$line =~ s/(\p{Surrogate})/ /g; - - # punctuation - $line =~ s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; -#convert tabs to white space - $line =~ s/\t/ /g; - #hard to soft space - $line =~ s/ / /g; -#squeeze white space - $line =~ s/\s+/ /g; -#initial and final white space - $line =~ s/^\p{Separator}+//; - $line =~ s/\p{Separator}+$//; -#down case - $line = lc $line; - - - print $RL "$line\n"; - +if ( -e $filtered ) { + warn "$filtered already exists."; +} else { + open my $FLT, '+>:utf8', $filtered or croak "problems with $filtered $!"; + LINE: while ( my $line = <$C> ) { + $line = decode_utf8 $line; + chomp $line; + + my @tokens = split /\s+/, $line; + + next LINE if ( ($#tokens < $low_bound) or ($#tokens > $up_bound )); + + # remove punctuation + $line =~ 
s/(\p{Punctuation}+|\p{Dash_Punctuation}+|\p{Close_Punctuation}+|\p{Open_Punctuation}+|\p{Initial_Punctuation}+|\p{Final_Punctuation}+|\p{Connector_Punctuation}+|\p{Other_Punctuation}+|[ ]+)/ /msxg; + #convert tabs to white space + $line =~ s/\t/ /g; + #hard to soft space + $line =~ s/ / /g; + #squeeze white space + $line =~ s/\s+/ /g; + #initial and final white space + $line =~ s/^\p{Separator}+//; + $line =~ s/\p{Separator}+$//; + #down case + $line = lc $line; + + print $FLT "$line\n"; + } + close $FLT; } - close $C; -close $RL; + # find out of vocabulary words -# $symtab points to a file containing a map of symbols to integers +# $symbol_table points to a file containing a map of symbols to integers # hash for word to integer map my %sym2int = (); -open my $F, '<', $symtab or croak "problem with $symtab $!"; +open my $F, '<', $symbol_table or croak "problem with $symbol_table $!"; # store words to int map in hash while( my $line = <$F>) { @@ -84,33 +79,33 @@ } close $F; -open my $I, '<', $rl or croak "problem with $rl $!"; -open my $OO, '+>', $oo or croak "problems with $oo $!"; +open my $I, '<', $filtered or croak "problem with $filtered $!"; +open my $OOVS, '+>', $oovs or croak "problems with $oovs $!"; while ( my $line = <$I>) { chomp $line; my @A = split /\s/, $line; foreach my $a (@A) { if (!defined ($sym2int{$a})) { - print $OO "$a\n"; + print $OOVS "$a\n"; } } } -close $OO; +close $OOVS; close $I; # remove segments with OOVs # store OOVS in hash my %oov = (); -open my $V, '<', $oo or croak "problems with $oo $!"; +open my $V, '<', $oovs or croak "problems with $oovs $!"; while ( my $line = <$V> ) { chomp $line; $oov{$line} = 1; } close $V; -open my $L, '<', $rl or croak "problems with $rl $!"; +open my $L, '<', $filtered or croak "problems with $filtered $!"; open my $IV, '+>', $iv or croak "problems with $iv $!"; SEGMENT: while ( my $segment = <$L> ) { diff --git a/egs/heroico/s5/run.sh b/egs/heroico/s5/run.sh index 711bece3c66..67ad87e55f9 100755 --- a/egs/heroico/s5/run.sh +++ b/egs/heroico/s5/run.sh @@ -1,83 +1,72 @@ #!/bin/bash . ./cmd.sh - . ./path.sh + stage=0 +# the location of the LDC corpus; this location works for the CLSP grid. +datadir=/export/corpora5/LDC/LDC2006S37 + +# The corpus and lexicon are on openslr.org +speech="http://www.openslr.org/resources/39/LDC2006S37.tar.gz" +lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" + +# Location of the Movie subtitles text corpus +subs_src="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2018/en-es.txt.zip" + . utils/parse_options.sh set -e set -o pipefail set -u -# the location of the LDC corpus; this location works for the CLSP grid. -datadir=/export/corpora5/LDC/LDC2006S37 - -#datadir=/mnt/corpora/LDC2006S37 -# location of subtitles text data -# note: this is not used so I'm commenting it out; dan. -#subsdata="http://opus.lingfil.uu.se/download.php?f=OpenSubtitles2016/en-es.txt.zip" -lexicon="http://www.openslr.org/resources/34/santiago.tar.gz" # don't change tmpdir, the location is used explicitly in scripts in local/. tmpdir=data/local/tmp if [ $stage -le 0 ]; then - # prepare the lists for acoustic model training and testing - mkdir -p $tmpdir/heroico - mkdir -p $tmpdir/usma - - [ ! 
-d "$datadir" ] && \ - echo "$0 Data directory (LDC corpus release) does not exist" && \ - exit 1 - local/prepare_data.sh $datadir + # download the corpus from openslr + local/heroico_download.sh $speech $lexicon + # Get data for lm training + local/subs_download.sh $subs_src fi if [ $stage -le 1 ]; then - # prepare a dictionary - mkdir -p data/local/dict - mkdir -p data/local/tmp/dict - - # download the dictionary from openslr - if [ ! -f data/local/tmp/dict/santiago.tar.gz ]; then - wget -O data/local/tmp/dict/santiago.tar.gz $lexicon - fi - - ( - cd $tmpdir/dict - tar -xzf santiago.tar.gz - ) + echo "Makin lists for building models." + local/prepare_data.sh $datadir +fi +if [ $stage -le 2 ]; then + mkdir -p data/local/dict $tmpdir/dict local/prepare_dict.sh +fi - # prepare the lang directory +if [ $stage -le 3 ]; then utils/prepare_lang.sh \ data/local/dict "" \ data/local/lang data/lang fi -if [ $stage -le 2 ]; then - # use am training text to train lm - mkdir -p $tmpdir/heroico/lm +if [ $stage -le 4 ]; then + mkdir -p $tmpdir/subs/lm + local/subs_prepare_data.pl +fi + +if [ $stage -le 5 ]; then echo "point 1" - # get the text from data/train/text - cut -d " " -f 2- data/train/text > $tmpdir/heroico/lm/train.txt - echo "point 2" - # build lm - local/prepare_lm.sh $tmpdir/heroico/lm/train.txt + local/prepare_lm.sh $tmpdir/subs/lm/in_vocabulary.txt +fi - echo "point 3" +if [ $stage -le 6 ]; then + echo "point 2" utils/format_lm.sh \ data/lang data/local/lm/trigram.arpa.gz data/local/dict/lexicon.txt \ data/lang_test - - # delete temporary work - rm -rf data/local/tmp fi -if [ $stage -le 3 ]; then - # extract acoustic features +if [ $stage -le 7 ]; then + echo "$0: extracting acoustic features." mkdir -p exp for fld in native nonnative test devtest train; do @@ -92,7 +81,7 @@ if [ $stage -le 3 ]; then done fi -if [ $stage -le 4 ]; then +if [ $stage -le 8 ]; then echo "$0 monophone training" steps/train_mono.sh --nj 8 --cmd "$train_cmd" data/train data/lang exp/mono || exit 1; @@ -108,8 +97,7 @@ if [ $stage -le 4 ]; then ) & fi -if [ $stage -le 5 ]; then - +if [ $stage -le 9 ]; then # align with monophones steps/align_si.sh --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/mono exp/mono_ali @@ -131,10 +119,8 @@ if [ $stage -le 5 ]; then fi -if [ $stage -le 6 ]; then +if [ $stage -le 10 ]; then echo "$0: Starting delta system alignment" - - # align with triphones steps/align_si.sh \ --nj 8 --cmd "$train_cmd" data/train data/lang exp/tri1 exp/tri1_ali @@ -156,10 +142,9 @@ if [ $stage -le 6 ]; then ) & fi -if [ $stage -le 7 ]; then +if [ $stage -le 11 ]; then echo "$0: Starting LDA+MLLT system alignment" - # align with lda and mllt adapted triphones steps/align_si.sh \ --use-graphs true --nj 8 --cmd "$train_cmd" \ data/train data/lang exp/tri2b exp/tri2b_ali @@ -169,7 +154,6 @@ if [ $stage -le 7 ]; then --cmd "$train_cmd" \ 3100 50000 data/train data/lang exp/tri2b_ali exp/tri3b - # align with tri3b models echo "$0 Starting exp/tri3b_ali" steps/align_fmllr.sh \ --nj 8 --cmd "$train_cmd" \ @@ -182,16 +166,16 @@ if [ $stage -le 7 ]; then utils/mkgraph.sh \ data/lang_test exp/tri3b exp/tri3b/graph || exit 1; - # decode test sets with tri3b models for x in native nonnative devtest test; do + echo "$0: decoding $x with tri3b models." steps/decode_fmllr.sh \ --nj 8 --cmd "$decode_cmd" exp/tri3b/graph data/$x exp/tri3b/decode_${x} done ) & fi -if [ $stage -le 9 ]; then - # train and test chain models +if [ $stage -le 12 ]; then + echo "$0: train and test chain models." 
local/chain/run_tdnn.sh fi diff --git a/egs/iam/v1/run_end2end.sh b/egs/iam/v1/run_end2end.sh index 294e41cbc85..6df93e739f4 100755 --- a/egs/iam/v1/run_end2end.sh +++ b/egs/iam/v1/run_end2end.sh @@ -14,7 +14,7 @@ iam_database=/export/corpora5/handwriting_ocr/IAM # wellington_database points to the database path on the JHU grid. The Wellington # corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). # This corpus is of written NZ English that can be purchased here: -# "https://www.victoria.ac.nz/lals/resources/corpora-default" +# "https://www.victoria.ac.nz/lals/resources/corpora-default" wellington_database=/export/corpora5/Wellington/WWC/ . ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. @@ -90,7 +90,8 @@ fi if [ $stage -le 5 ]; then echo "$0: Aligning the training data using the e2e chain model..." steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/iam/v2/cmd.sh b/egs/iam/v2/cmd.sh new file mode 100644 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/iam/v2/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/iam/v2/image b/egs/iam/v2/image new file mode 120000 index 00000000000..1668ee99922 --- /dev/null +++ b/egs/iam/v2/image @@ -0,0 +1 @@ +../../cifar/v1/image/ \ No newline at end of file diff --git a/egs/iam/v2/local/chain/compare_wer.sh b/egs/iam/v2/local/chain/compare_wer.sh new file mode 100755 index 00000000000..d4076457463 --- /dev/null +++ b/egs/iam/v2/local/chain/compare_wer.sh @@ -0,0 +1,90 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. 
./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/iam/v2/local/chain/run_cnn_e2eali.sh b/egs/iam/v2/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..ad51803ab0e --- /dev/null +++ b/egs/iam/v2/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1c.sh \ No newline at end of file diff --git a/egs/iam/v2/local/chain/run_e2e_cnn.sh b/egs/iam/v2/local/chain/run_e2e_cnn.sh new file mode 100755 index 00000000000..15bdf610cd3 --- /dev/null +++ b/egs/iam/v2/local/chain/run_e2e_cnn.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) + +# local/chain/compare_wer.sh exp/chain/cnn_1a exp/chain/cnn_chainali_1c exp/chain/e2e_cnn_1a +# System cnn_1a cnn_chainali_1c e2e_cnn_1a +# WER 18.52 12.72 12.15 +# CER 10.07 5.99 6.03 +# Final train prob -0.0077 -0.0291 -0.0371 +# Final valid prob -0.0970 -0.0359 -0.0636 +# Final train prob (xent) -0.5484 -0.9781 +# Final valid prob (xent) -0.9643 -1.1544 +# Parameters 4.36M 3.96M 9.13M + +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=21 nj=2..4 num-params=9.1M dim=40->12640 combine=-0.033->-0.033 (over 1) logprob:train/valid[13,20,final]=(-0.058,-0.042,-0.035/-0.070,-0.064,-0.059) + +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a + +# training options +tdnn_dim=450 +num_epochs=4 +num_jobs_initial=2 +num_jobs_final=4 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +l2_regularize=0.00005 +frames_per_iter=1000000 +cmvn_opts="--norm-means=true --norm-vars=true" +train_set=train +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + 
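# (Reading the xconfig above, not part of the patch: the 40-dim input is
# treated as a 40-pixel-high image; cnn2 and cnn5 set height-subsample-out=2,
# so the feature height shrinks 40 -> 20 -> 10 before the TDNN layers, while
# subsampling in time (factor 4) is applied separately via the
# --chain.frame-subsampling-factor option passed to training below.)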
steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. + + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts "$cmvn_opts" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize $l2_regularize \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter $frames_per_iter \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial $num_jobs_initial \ + --trainer.optimization.num-jobs-final $num_jobs_final \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj 30 --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..ba28f681708 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,245 @@ +#!/bin/bash + +# e2eali_1a is the same as chainali_1c but uses the e2e chain model to get the +# lattice alignments and to build a tree + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_chainali_1c exp/chain/cnn_e2eali_1a +# System e2e_cnn_1a cnn_chainali_1c cnn_e2eali_1a +# WER 13.87 12.72 12.70 +# CER 6.54 5.99 5.75 +# Final train prob -0.0371 -0.0291 -0.0557 +# Final valid prob -0.0636 -0.0359 -0.0770 +# Final train prob (xent) -0.9781 -0.8847 +# Final valid prob (xent) -1.1544 -1.0370 +# Parameters 9.13M 3.96M 3.95M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.056->-0.056 (over 1) xent:train/valid[13,20,final]=(-1.47,-0.978,-0.918/-1.54,-1.10,-1.06) logprob:train/valid[13,20,final]=(-0.106,-0.065,-0.056/-0.113,-0.086,-0.079) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. 
"1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_test=lang_unk +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + 
output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. 
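  # (Worked example for the xent branch defined above, not part of the patch:
  # with xent_regularize=0.1 as set at the top of this script,
  # learning_rate_factor = 0.5 / 0.1 = 5.0, so output-xent learns 5x faster,
  # offsetting the 0.1 scaling applied to the cross-entropy objective.)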
+ + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/$lang_test \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; +fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh new file mode 100755 index 00000000000..298e7053086 --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1b.sh @@ -0,0 +1,251 @@ +#!/bin/bash + +# e2eali_1b is the same as e2eali_1a but uses unconstrained egs + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1a exp/chain/cnn_e2eali_1b +# System cnn_e2eali_1a cnn_e2eali_1b +# WER 10.40 10.33 +# WER (rescored) 10.02 10.10 +# CER 4.97 5.00 +# CER (rescored) 4.83 4.88 +# Final train prob -0.0612 -0.0428 +# Final valid prob -0.0857 -0.0666 +# Final train prob (xent) -0.8990 -0.9210 +# Final valid prob (xent) -1.0024 -1.0264 +# Parameters 3.98M 3.98M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1b +# exp/chain/cnn_e2eali_1b: num-iters=21 nj=2..4 num-params=4.0M dim=40->360 combine=-0.038->-0.038 (over 1) xent:train/valid[13,20,final]=(-1.34,-0.967,-0.838/-1.40,-1.07,-0.985) logprob:train/valid[13,20,final]=(-0.075,-0.054,-0.037/-0.083,-0.072,-0.059) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=450 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.075" + tdnn_opts="l2-regularize=0.075" + output_opts="l2-regularize=0.1" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn6 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + conv-relu-batchnorm-layer name=cnn7 height-in=10 height-out=10 time-offsets=-1,0,1 $common3 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=4 \ + --trainer.frames-per-iter=1000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi diff --git a/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh new file mode 100755 index 00000000000..ef851c8ae2f --- /dev/null +++ b/egs/iam/v2/local/chain/tuning/run_cnn_e2eali_1c.sh @@ -0,0 +1,253 @@ +#!/bin/bash + +# e2eali_1c is the same as e2eali_1b but has fewer CNN layers, smaller +# l2-regularize, more epochs and uses dropout. 
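#
# (Editor's note, an illustration rather than part of the patch: to see exactly
# what changed relative to 1b, a plain diff of the two tuning scripts added in
# this PR is enough, e.g.
#   diff local/chain/tuning/run_cnn_e2eali_1b.sh local/chain/tuning/run_cnn_e2eali_1c.sh
# which should show the dropped cnn6/cnn7 layers, the smaller l2-regularize
# values, the larger tdnn_dim, the extra epochs and the added dropout schedule.)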
+ + +# local/chain/compare_wer.sh exp/chain/cnn_e2eali_1b exp/chain/cnn_e2eali_1c +# System cnn_e2eali_1b cnn_e2eali_1c +# WER 10.33 10.05 +# WER (rescored) 10.10 9.75 +# CER 5.00 4.76 +# CER (rescored) 4.88 4.68 +# Final train prob -0.0428 -0.0317 +# Final valid prob -0.0666 -0.0630 +# Final train prob (xent) -0.9210 -0.5413 +# Final valid prob (xent) -1.0264 -0.7096 +# Parameters 3.98M 5.12M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1c +# exp/chain/cnn_e2eali_1c: num-iters=21 nj=2..4 num-params=5.1M dim=40->392 combine=-0.034->-0.034 (over 1) xent:train/valid[13,20,final]=(-0.953,-0.800,-0.541/-1.03,-0.933,-0.710) logprob:train/valid[13,20,final]=(-0.069,-0.048,-0.032/-0.091,-0.078,-0.063) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1b6 #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g + +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=500" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=8 \ + --trainer.frames-per-iter=2000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=4 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=64,32 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/test $dir/decode_test || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/test $dir/decode_test{,_rescored} || exit 1 +fi diff --git a/egs/iam/v2/local/check_tools.sh b/egs/iam/v2/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/iam/v2/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. 
+# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." + exit 1 +fi + + +exit 0 diff --git a/egs/iam/v2/local/make_features.py b/egs/iam/v2/local/make_features.py new file mode 100755 index 00000000000..84e012daedb --- /dev/null +++ b/egs/iam/v2/local/make_features.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2018 Hossein Hadian + +""" This script converts images to Kaldi-format feature matrices. The input to + this script is the path to a data directory, e.g. "data/train". This script + reads the images listed in images.scp and writes them to standard output + (by default) as Kaldi-formatted matrices (in text form). It also scales the + images so they have the same height (via --feat-dim). It can optionally pad + the images (on left/right sides) with white pixels. + If an 'image2num_frames' file is found in the data dir, it will be used + to enforce the images to have the specified length in that file by padding + white pixels (the --padding option will be ignored in this case). This relates + to end2end chain training. + + eg. local/make_features.py data/train --feat-dim 40 +""" + +import argparse +import os +import sys +import numpy as np +from scipy import misc + +parser = argparse.ArgumentParser(description="""Converts images (in 'dir'/images.scp) to features and + writes them to standard output in text format.""") +parser.add_argument('dir', type=str, + help='Source data directory (containing images.scp)') +parser.add_argument('--out-ark', type=str, default='-', + help='Where to write the output feature file') +parser.add_argument('--feat-dim', type=int, default=40, + help='Size to scale the height of all images') +parser.add_argument('--padding', type=int, default=5, + help='Number of white pixels to pad on the left' + 'and right side of the image.') + + +args = parser.parse_args() + + +def write_kaldi_matrix(file_handle, matrix, key): + file_handle.write(key + " [ ") + num_rows = len(matrix) + if num_rows == 0: + raise Exception("Matrix is empty") + num_cols = len(matrix[0]) + + for row_index in range(len(matrix)): + if num_cols != len(matrix[row_index]): + raise Exception("All the rows of a matrix are expected to " + "have the same length") + file_handle.write(" ".join(map(lambda x: str(x), matrix[row_index]))) + if row_index != num_rows - 1: + file_handle.write("\n") + file_handle.write(" ]\n") + +def get_scaled_image(im, allowed_lengths = None): + scale_size = args.feat_dim + sx = im.shape[1] + sy = im.shape[0] + scale = (1.0 * scale_size) / sy + nx = int(scale_size) + ny = int(scale * sx) + im = misc.imresize(im, (nx, ny)) + if allowed_lengths is None: + left_padding = right_padding = args.padding + else: # Find an allowed length for the image + imlen = im.shape[1] + allowed_len = 0 + for l in allowed_lengths: + if l > imlen: + allowed_len = l + break + if allowed_len == 0: + # No allowed length was 
found for the image (the image is too long) + return None + padding = allowed_len - imlen + left_padding = padding // 2 + right_padding = padding - left_padding + dim_y = im.shape[0] + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + return im_pad1 + +### main ### +data_list_path = os.path.join(args.dir, 'images.scp') + +if args.out_ark == '-': + out_fh = sys.stdout +else: + out_fh = open(args.out_ark,'wb') + +allowed_lengths = None +if os.path.isfile(os.path.join(args.dir, 'allowed_lengths.txt')): + print("Found 'allowed_lengths.txt' file...", file=sys.stderr) + allowed_lengths = [] + with open(os.path.join(args.dir,'allowed_lengths.txt')) as f: + for line in f: + allowed_lengths.append(int(line.strip())) + print("Read {} allowed lengths and will apply them to the " + "features.".format(len(allowed_lengths)), file=sys.stderr) + +num_fail = 0 +num_ok = 0 +with open(data_list_path) as f: + for line in f: + line = line.strip() + line_vect = line.split(' ') + image_id = line_vect[0] + image_path = line_vect[1] + im = misc.imread(image_path) + im_scaled = get_scaled_image(im, allowed_lengths) + + if im_scaled is None: + num_fail += 1 + continue + data = np.transpose(im_scaled, (1, 0)) + data = np.divide(data, 255.0) + num_ok += 1 + write_kaldi_matrix(out_fh, data, image_id) + +print('Generated features for {} images. Failed for {} (iamge too ' + 'long).'.format(num_ok, num_fail), file=sys.stderr) diff --git a/egs/iam/v2/local/prepare_data.sh b/egs/iam/v2/local/prepare_data.sh new file mode 100755 index 00000000000..73d711c73f0 --- /dev/null +++ b/egs/iam/v2/local/prepare_data.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora +# 2017 Hossein Hadian +# Apache 2.0 + +# This script downloads the IAM handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling process_data.py. +# It also downloads the LOB and Brown text corpora. It downloads the database files +# only if they do not already exist in download directory. + +# Eg. local/prepare_data.sh +# Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from +# utt2spk file: 000_a01-000u-00 000 +# images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +# spk2utt file: 000 000_a01-000u-00 000_a01-000u-01 000_a01-000u-02 000_a01-000u-03 + +stage=0 +download_dir=data/download +wellington_dir= +username= +password= # username and password for downloading the IAM database + # if you have not already downloaded the database, please + # register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database + # and provide this script with your username and password. + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +if [[ ! -f $download_dir/lines.tgz && -z $username ]]; then + echo "$0: Warning: Couldn't find lines.tgz in $download_dir. Unless the extracted dataset files" + echo "exist in your data/local directory this script will fail because the required files" + echo "can't be downloaded automatically (it needs registration)." + echo "Please register at http://www.fki.inf.unibe.ch/databases/iam-handwriting-database" + echo "... 
and then call this script again with --username --password " + echo "" + exit 1 +fi + +lines=data/local/lines +xml=data/local/xml +ascii=data/local/ascii +bcorpus=data/local/browncorpus +lobcorpus=data/local/lobcorpus +wcorpus=data/local/wellingtoncorpus +data_split_info=data/local/largeWriterIndependentTextLineRecognitionTask +lines_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/lines/lines.tgz +xml_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/xml/xml.tgz +data_split_info_url=http://www.fki.inf.unibe.ch/DBs/iamDB/tasks/largeWriterIndependentTextLineRecognitionTask.zip +ascii_url=http://www.fki.inf.unibe.ch/DBs/iamDB/data/ascii/ascii.tgz +brown_corpus_url=http://www.sls.hawaii.edu/bley-vroman/brown.txt +lob_corpus_url=http://ota.ox.ac.uk/text/0167.zip +wellington_corpus_loc=/export/corpora5/Wellington/WWC/ +mkdir -p $download_dir data/local + +# download and extact images and transcription +if [ -d $lines ]; then + echo "$0: Not downloading lines images as it is already there." +else + if [ ! -f $download_dir/lines.tgz ]; then + echo "$0: Trying to download lines images..." + wget -P $download_dir --user "$username" --password "$password" $lines_url || exit 1; + fi + mkdir -p $lines + tar -xzf $download_dir/lines.tgz -C $lines || exit 1; + echo "$0: Done downloading and extracting lines images" +fi + +if [ -d $xml ]; then + echo "$0: Not downloading transcriptions as it is already there." +else + if [ ! -f $download_dir/xml.tgz ]; then + echo "$0: Trying to download transcriptions..." + wget -P $download_dir --user "$username" --password "$password" $xml_url || exit 1; + fi + mkdir -p $xml + tar -xzf $download_dir/xml.tgz -C $xml || exit 1; + echo "$0: Done downloading and extracting transcriptions." +fi + +if [ -d $data_split_info ]; then + echo "$0: Not downloading data split information as it is already there." +else + if [ ! -f $download_dir/largeWriterIndependentTextLineRecognitionTask.zip ]; then + echo "$0: Trying to download training and testing data split information..." + wget -P $download_dir --user "$username" --password "$password" $data_split_info_url || exit 1; + fi + mkdir -p $data_split_info + unzip $download_dir/largeWriterIndependentTextLineRecognitionTask.zip -d $data_split_info || exit 1; + echo "$0: Done downloading and extracting training and testing data split information" +fi + +if [ -d $ascii ]; then + echo "$0: Not downloading ascii.tgz as it is already there." +else + if [ ! -f $download_dir/ascii.tgz ]; then + echo "$0: trying to download ascii.tgz..." + wget -P $download_dir --user "$username" --password "$password" $ascii_url || exit 1; + fi + mkdir -p $ascii + tar -xzf $download_dir/ascii.tgz -C $ascii || exit 1; + echo "$0: Done downloading and extracting ascii.tgz" +fi + +if [ -d $lobcorpus ]; then + echo "$0: Not downloading the LOB text corpus as it is already there." +else + if [ ! -f $lobcorpus/0167.zip ]; then + echo "$0: Downloading the LOB text corpus ..." + mkdir -p $lobcorpus + wget -P $lobcorpus/ $lob_corpus_url || exit 1; + fi + unzip $lobcorpus/0167.zip -d $lobcorpus || exit 1; + echo "$0: Done downloading and extracting LOB corpus" +fi + +if [ -d $bcorpus ]; then + echo "$0: Not downloading the Brown corpus as it is already there." +else + if [ ! -f $bcorpus/brown.txt ]; then + mkdir -p $bcorpus + echo "$0: Downloading the Brown text corpus..." 
+ wget -P $bcorpus $brown_corpus_url || exit 1; + fi + echo "$0: Done downloading the Brown text corpus" +fi + +if [ -d $wcorpus ]; then + echo "$0: Not copying Wellington corpus as it is already there." +elif [ ! -z $wellington_dir ]; then + mkdir -p $wcorpus + cp -r $wellington_dir/. $wcorpus + + # Combine Wellington corpora and replace some of their annotations + cat data/local/wellingtoncorpus/Section{A,B,C,D,E,F,G,H,J,K,L}.txt | \ + cut -d' ' -f3- | sed "s/^[ \t]*//" > data/local/wellingtoncorpus/Wellington_annotated.txt + + cat data/local/wellingtoncorpus/Wellington_annotated.txt | local/remove_wellington_annotations.py > data/local/wellingtoncorpus/Wellington_annotation_removed.txt + + echo "$0: Done copying Wellington corpus" +else + echo "$0: Wellington Corpus not included because wellington_dir not provided" +fi + +mkdir -p data/{train,test,val} +file_name=largeWriterIndependentTextLineRecognitionTask + +train_old="data/local/$file_name/trainset.txt" +test_old="data/local/$file_name/testset.txt" +val1_old="data/local/$file_name/validationset1.txt" +val2_old="data/local/$file_name/validationset2.txt" + +train_new="data/local/train.uttlist" +test_new="data/local/test.uttlist" +val_new="data/local/validation.uttlist" + +cat $train_old > $train_new +cat $test_old > $test_new +cat $val1_old $val2_old > $val_new + +if [ $stage -le 0 ]; then + local/process_data.py data/local data/train --dataset train || exit 1 + local/process_data.py data/local data/test --dataset test || exit 1 + local/process_data.py data/local data/val --dataset validation || exit 1 + + utils/utt2spk_to_spk2utt.pl data/train/utt2spk > data/train/spk2utt + utils/utt2spk_to_spk2utt.pl data/test/utt2spk > data/test/spk2utt +fi diff --git a/egs/iam/v2/local/prepare_dict.sh b/egs/iam/v2/local/prepare_dict.sh new file mode 100755 index 00000000000..e21a59c7e92 --- /dev/null +++ b/egs/iam/v2/local/prepare_dict.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +vocab_size=50000 +. ./utils/parse_options.sh + +mkdir -p $dir + +# First get the set of all letters that occur in data/train/text +cat data/train/text | \ + perl -ne '@A = split; shift @A; for(@A) {print join("\n", split(//)), "\n";}' | \ + sort -u | grep -v "|" > $dir/nonsilence_phones.txt + +# Now use the pocolm's wordlist which is the most N frequent words in +# in data/train/text and LOB+Brown corpora (dev and test excluded) with their comprising +# letters as their transcription. Only include words that use the above letters. +# (Letter # is replaced with ) + +export letters=$(cat $dir/nonsilence_phones.txt | tr -d "\n") + +head -n $vocab_size data/local/local_lm/data/word_count | awk '{print $2}' | \ + perl -e '$letters=$ENV{letters}; $letters=$letters . 
"|"; +while(<>){ + chop; + $w = $_; + if($w =~ m/^[$letters]+$/){ + $trans = join(" ", split(//, $w)); + $trans =~ s/#//g; + $trans =~ s/\|/SIL/g; + print "$w $trans\n"; + } +}' | sort -u > $dir/lexicon.txt + + +sed -i "s/#//" $dir/nonsilence_phones.txt + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/iam/v2/local/prepend_words.py b/egs/iam/v2/local/prepend_words.py new file mode 100755 index 00000000000..d53eb8974bf --- /dev/null +++ b/egs/iam/v2/local/prepend_words.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# This script, prepend '|' to every words in the transcript to mark +# the beginning of the words for finding the initial-space of every word +# after decoding. + +import sys, io + +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8') +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8') +for line in infile: + output.write(' '.join(["|" + word for word in line.split()]) + '\n') diff --git a/egs/iam/v2/local/process_data.py b/egs/iam/v2/local/process_data.py new file mode 100755 index 00000000000..fa5eb484707 --- /dev/null +++ b/egs/iam/v2/local/process_data.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +""" This script reads the extracted IAM database files and creates + the following files (for the data subset selected via --dataset): + text, utt2spk, images.scp. + + Eg. local/process_data.py data/local data/train data --dataset train + Eg. text file: 000_a01-000u-00 A MOVE to stop Mr. Gaitskell from + utt2spk file: 000_a01-000u-00 000 + images.scp file: 000_a01-000u-00 data/local/lines/a01/a01-000u/a01-000u-00.png +""" + +import argparse +import os +import sys +import xml.dom.minidom as minidom + +parser = argparse.ArgumentParser(description="""Creates text, utt2spk + and images.scp files.""") +parser.add_argument('database_path', type=str, + help='Path to the downloaded (and extracted) IAM data') +parser.add_argument('out_dir', type=str, + help='Where to write output files.') +parser.add_argument('--dataset', type=str, default='train', + choices=['train', 'test','validation'], + help='Subset of data to process.') +args = parser.parse_args() + +text_file = os.path.join(args.out_dir + '/', 'text') +text_fh = open(text_file, 'w') + +utt2spk_file = os.path.join(args.out_dir + '/', 'utt2spk') +utt2spk_fh = open(utt2spk_file, 'w') + +image_file = os.path.join(args.out_dir + '/', 'images.scp') +image_fh = open(image_file, 'w') + +dataset_path = os.path.join(args.database_path, + args.dataset + '.uttlist') + +text_file_path = os.path.join(args.database_path, + 'ascii','lines.txt') +text_dict = {} +def process_text_file_for_word_model(): + with open (text_file_path, 'rt') as in_file: + for line in in_file: + if line[0]=='#': + continue + line = line.strip() + utt_id = line.split(' ')[0] + text_vect = line.split(' ')[8:] + text = "".join(text_vect) + text = text.replace("|", " ") + text_dict[utt_id] = text + +print("Processing '{}' data...".format(args.dataset)) +process_text_file_for_word_model() + +with open(dataset_path) as f: + for line in f: + line = line.strip() + line_vect = line.split('-') + xml_file = line_vect[0] + '-' + line_vect[1] + xml_path = os.path.join(args.database_path, 'xml', xml_file + '.xml') + img_num = line[-3:] + doc = minidom.parse(xml_path) + + form_elements = doc.getElementsByTagName('form')[0] + writer_id = 
form_elements.getAttribute('writer-id') + outerfolder = form_elements.getAttribute('id')[0:3] + innerfolder = form_elements.getAttribute('id') + lines_path = os.path.join(args.database_path, 'lines', + outerfolder, innerfolder, innerfolder) + image_file_path = lines_path + img_num + '.png' + text = text_dict[line] + utt_id = writer_id + '_' + line + text_fh.write(utt_id + ' ' + text + '\n') + utt2spk_fh.write(utt_id + ' ' + writer_id + '\n') + image_fh.write(utt_id + ' ' + image_file_path + '\n') diff --git a/egs/iam/v2/local/remove_test_utterances_from_lob.py b/egs/iam/v2/local/remove_test_utterances_from_lob.py new file mode 100755 index 00000000000..1b414ef47f6 --- /dev/null +++ b/egs/iam/v2/local/remove_test_utterances_from_lob.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +# Copyright 2018 Ashish Arora + +import argparse +import os +import numpy as np +import sys +import re + +parser = argparse.ArgumentParser(description="""Removes dev/test set lines + from the LOB corpus. Reads the + corpus from stdin, and writes it to stdout.""") +parser.add_argument('dev_text', type=str, + help='dev transcription location.') +parser.add_argument('test_text', type=str, + help='test transcription location.') +args = parser.parse_args() + +def remove_punctuations(transcript): + char_list = [] + for char in transcript: + if char.isdigit() or char == '+' or char == '~' or char == '?': + continue + if char == '#' or char == '=' or char == '-' or char == '!': + continue + if char == ',' or char == '.' or char == ')' or char == '\'': + continue + if char == '(' or char == ':' or char == ';' or char == '"': + continue + char_list.append(char) + return char_list + + +def remove_special_words(words): + word_list = [] + for word in words: + if word == '' or word == '#': + continue + word_list.append(word) + return word_list + + +# process and add dev/eval transcript in a list +# remove special words, punctuations, spaces between words +# lowercase the characters +def read_utterances(text_file_path): + with open(text_file_path, 'rt') as in_file: + for line in in_file: + words = line.strip().split() + words_wo_sw = remove_special_words(words) + transcript = ''.join(words_wo_sw[1:]) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + utterance_dict[words_wo_sw[0]] = transcript + + +### main ### + +# read utterances and add it to utterance_dict +utterance_dict = dict() +read_utterances(args.dev_text) +read_utterances(args.test_text) + +# read corpus and add it to below lists +corpus_text_lowercase_wo_sc = list() +corpus_text_wo_sc = list() +original_corpus_text = list() +for line in sys.stdin: + original_corpus_text.append(line) + words = line.strip().split() + words_wo_sw = remove_special_words(words) + + transcript = ''.join(words_wo_sw) + transcript = transcript.lower() + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_lowercase_wo_sc.append(transcript) + + transcript = ''.join(words_wo_sw) + trans_wo_punct = remove_punctuations(transcript) + transcript = ''.join(trans_wo_punct) + corpus_text_wo_sc.append(transcript) + +# find majority of utterances below +# for utterances which were not found +# add them to remaining_utterances +row_to_keep = [True for i in range(len(original_corpus_text))] +remaining_utterances = dict() +for line_id, line_to_find in utterance_dict.items(): + found_line = False + for i in range(1, (len(corpus_text_lowercase_wo_sc) - 2)): + # Combine 3 consecutive lines 
of the corpus into a single line + prev_words = corpus_text_lowercase_wo_sc[i - 1].strip() + curr_words = corpus_text_lowercase_wo_sc[i].strip() + next_words = corpus_text_lowercase_wo_sc[i + 1].strip() + new_line = prev_words + curr_words + next_words + transcript = ''.join(new_line) + if line_to_find in transcript: + found_line = True + row_to_keep[i-1] = False + row_to_keep[i] = False + row_to_keep[i+1] = False + if not found_line: + remaining_utterances[line_id] = line_to_find + + +for i in range(len(original_corpus_text)): + transcript = original_corpus_text[i].strip() + if row_to_keep[i]: + print(transcript) + +print('Sentences not removed from LOB: {}'.format(remaining_utterances), file=sys.stderr) +print('Total test+dev sentences: {}'.format(len(utterance_dict)), file=sys.stderr) +print('Number of sentences not removed from LOB: {}'. format(len(remaining_utterances)), file=sys.stderr) +print('LOB lines: Before: {} After: {}'.format(len(original_corpus_text), + row_to_keep.count(True)), file=sys.stderr) diff --git a/egs/iam/v2/local/remove_wellington_annotations.py b/egs/iam/v2/local/remove_wellington_annotations.py new file mode 100755 index 00000000000..260a3542985 --- /dev/null +++ b/egs/iam/v2/local/remove_wellington_annotations.py @@ -0,0 +1,32 @@ +#!/usr/bin/env python3 +# Copyright 2018 Chun-Chieh Chang + +import sys +import io +import re +from collections import OrderedDict + +sys.stdin = io.TextIOWrapper(sys.stdin.buffer, encoding="utf8"); +sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf8"); + +prev2_line = " "; +prev_line = " "; +for line in sys.stdin: + line = line.strip() + pattern = re.compile("\\*\\*\\[.*?\\*\\*\\]|\\*[0-9]|\\\\[0-9]{0,2}|\\*\\*?[\|,\?,\#,\=,\;,\:,\<,\>]|\||\^") + line_fixed = pattern.sub("", line) + dict=OrderedDict([("*+$","$"), ("*+","£"), ("*-","-"), ("*/","*"), ("*{","{"), ("*}","}"), + ("**\"","\""), ("*\"","\""), ("**'","'"), ("*'","'"), ("*@","°")]) + pattern = re.compile("|".join(re.escape(key) for key in dict.keys())); + line_fixed = pattern.sub(lambda x: dict[x.group()], line_fixed) + + line_fixed = prev2_line + "\n" + prev_line + "\n" + line_fixed + + pattern = re.compile("\{[0-9]{0,2}(.*?)\}", re.DOTALL) + line_fixed = pattern.sub(lambda x: x.group(1), line_fixed) + + output, prev2_line, prev_line = line_fixed.split("\n") + + sys.stdout.write(output + "\n") +sys.stdout.write(prev2_line + "\n") +sys.stdout.write(prev_line + "\n") diff --git a/egs/iam/v2/local/score.sh b/egs/iam/v2/local/score.sh new file mode 100755 index 00000000000..b2032909333 --- /dev/null +++ b/egs/iam/v2/local/score.sh @@ -0,0 +1,155 @@ +#!/bin/bash +# Copyright 2012-2014 Johns Hopkins University (Author: Daniel Povey, Yenda Trmal) +# Apache 2.0 + +# This script is like steps/scoring/score_kaldi_wer.sh except it transcribes the 's +# using local/unk_arc_post_to_transcription.py and also it calls +# steps/scoring/score_kaldi_cer.sh at the end. + +[ -f ./path.sh ] && . ./path.sh + +# begin configuration section. +cmd=run.pl +stage=0 +decode_mbr=false +stats=true +beam=6 +word_ins_penalty=0.0,0.5,1.0 +min_lmwt=3 +max_lmwt=13 +iter=final +#end configuration section. + +echo "$0 $@" # Print the command line for logging +[ -f ./path.sh ] && . ./path.sh +. parse_options.sh || exit 1; + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--cmd (run.pl|queue.pl...)] " + echo " Options:" + echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." + echo " --stage (0|1|2) # start scoring script from part-way through." 
+ echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." + echo " --min_lmwt # minumum LM-weight for lattice rescoring " + echo " --max_lmwt # maximum LM-weight for lattice rescoring " + exit 1; +fi + +data=$1 +lang_or_graph=$2 +dir=$3 +model_path=`echo $dir |xargs dirname` +symtab=$lang_or_graph/words.txt + +for f in $symtab $dir/lat.1.gz $data/text; do + [ ! -f $f ] && echo "score.sh: no such file $f" && exit 1; +done + + +ref_filtering_cmd="cat" +[ -x local/wer_output_filter ] && ref_filtering_cmd="local/wer_output_filter" +[ -x local/wer_ref_filter ] && ref_filtering_cmd="local/wer_ref_filter" +hyp_filtering_cmd="cat" +[ -x local/wer_output_filter ] && hyp_filtering_cmd="local/wer_output_filter" +[ -x local/wer_hyp_filter ] && hyp_filtering_cmd="local/wer_hyp_filter" + + +if $decode_mbr ; then + echo "$0: scoring with MBR, word insertion penalty=$word_ins_penalty" +else + echo "$0: scoring with word insertion penalty=$word_ins_penalty" +fi + + +mkdir -p $dir/scoring_kaldi +cat $data/text | $ref_filtering_cmd > $dir/scoring_kaldi/test_filt.txt || exit 1; +if [ $stage -le 0 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + mkdir -p $dir/scoring_kaldi/penalty_$wip/log + + if $decode_mbr ; then + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + acwt=\`perl -e \"print 1.0/LMWT\"\`\; \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-prune --beam=$beam ark:- ark:- \| \ + lattice-mbr-decode --word-symbol-table=$symtab \ + ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + + else + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/best_path.LMWT.log \ + lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ + lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ + lattice-best-path --word-symbol-table=$symtab ark:- ark,t:- \| \ + utils/int2sym.pl -f 2- $symtab \| \ + $hyp_filtering_cmd '>' $dir/scoring_kaldi/penalty_$wip/LMWT.txt || exit 1; + fi + + $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring_kaldi/penalty_$wip/log/score.LMWT.log \ + cat $dir/scoring_kaldi/penalty_$wip/LMWT.txt \| \ + compute-wer --text --mode=present \ + "ark:cat $dir/scoring_kaldi/test_filt.txt |" ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; + + done +fi + + + +if [ $stage -le 1 ]; then + + for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do + for lmwt in $(seq $min_lmwt $max_lmwt); do + # adding /dev/null to the command list below forces grep to output the filename + grep WER $dir/wer_${lmwt}_${wip} /dev/null + done + done | utils/best_wer.sh >& $dir/scoring_kaldi/best_wer || exit 1 + + best_wer_file=$(awk '{print $NF}' $dir/scoring_kaldi/best_wer) + best_wip=$(echo $best_wer_file | awk -F_ '{print $NF}') + best_lmwt=$(echo $best_wer_file | awk -F_ '{N=NF-1; print $N}') + + if [ -z "$best_lmwt" ]; then + echo "$0: we could not get the details of the best WER from the file $dir/wer_*. Probably something went wrong." 
+    exit 1;
+  fi
+
+  if $stats; then
+    mkdir -p $dir/scoring_kaldi/wer_details
+    echo $best_lmwt > $dir/scoring_kaldi/wer_details/lmwt # record best language model weight
+    echo $best_wip > $dir/scoring_kaldi/wer_details/wip # record best word insertion penalty
+
+    $cmd $dir/scoring_kaldi/log/stats1.log \
+      cat $dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \| \
+      align-text --special-symbol="'***'" ark:$dir/scoring_kaldi/test_filt.txt ark:- ark,t:- \| \
+      utils/scoring/wer_per_utt_details.pl --special-symbol "'***'" \| tee $dir/scoring_kaldi/wer_details/per_utt \|\
+      utils/scoring/wer_per_spk_details.pl $data/utt2spk \> $dir/scoring_kaldi/wer_details/per_spk || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/stats2.log \
+      cat $dir/scoring_kaldi/wer_details/per_utt \| \
+      utils/scoring/wer_ops_details.pl --special-symbol "'***'" \| \
+      sort -b -i -k 1,1 -k 4,4rn -k 2,2 -k 3,3 \> $dir/scoring_kaldi/wer_details/ops || exit 1;
+
+    $cmd $dir/scoring_kaldi/log/wer_bootci.log \
+      compute-wer-bootci --mode=present \
+        ark:$dir/scoring_kaldi/test_filt.txt ark:$dir/scoring_kaldi/penalty_$best_wip/$best_lmwt.txt \
+        '>' $dir/scoring_kaldi/wer_details/wer_bootci || exit 1;
+
+  fi
+fi
+
+steps/scoring/score_kaldi_cer.sh --cmd "$cmd" --stage 2 --min-lmwt $min_lmwt \
+  --max-lmwt $max_lmwt --word-ins-penalty $word_ins_penalty \
+  $data $lang_or_graph $dir
+
+# If we got here, the scoring was successful.
+# As a small aid to prevent confusion, we remove all wer_{?,??} and cer_{?,??}
+# files; they originate from a previous version of the scoring scripts, and
+# keeping them around could cause confusion about what this script actually produces.
+rm $dir/wer_{?,??} 2>/dev/null
+rm $dir/cer_{?,??} 2>/dev/null
+
+exit 0; diff --git a/egs/iam/v2/local/train_lm.sh b/egs/iam/v2/local/train_lm.sh new file mode 100755 index 00000000000..35eb56b1341 --- /dev/null +++ b/egs/iam/v2/local/train_lm.sh @@ -0,0 +1,156 @@
+#!/bin/bash
+
+# Copyright 2016  Vincent Nguyen
+#           2016  Johns Hopkins University (author: Daniel Povey)
+#           2017  Ashish Arora
+#           2017  Hossein Hadian
+# Apache 2.0
+#
+# This script trains an LM on the LOB and Brown corpora (plus the Wellington
+# corpus, if available) together with the IAM training transcriptions.
+# It is based on the example scripts distributed with PocoLM.
+
+# It checks whether pocolm is installed and, if not, prints installation
+# instructions and exits.
+
+set -e
+stage=0
+vocab_size=50000
+
+echo "$0 $@" # Print the command line for logging
+. ./utils/parse_options.sh || exit 1;
+
+dir=data/local/local_lm
+lm_dir=${dir}/data
+
+
+mkdir -p $dir
+. ./path.sh || exit 1; # for KALDI_ROOT
+export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH
+( # First make sure the pocolm toolkit is installed.
+ cd $KALDI_ROOT/tools || exit 1;
+ if [ -d pocolm ]; then
+   echo Not installing the pocolm toolkit since it is already there.
+ else
+   echo "$0: Please install the PocoLM toolkit with: "
+   echo " cd ../../../tools; extras/install_pocolm.sh; cd -"
+   exit 1;
+ fi
+) || exit 1;
+
+bypass_metaparam_optim_opt=
+# If you want to bypass the metaparameter optimization steps with specific metaparameters,
+# un-comment the following line and change the numbers to some appropriate values.
+# You can find the values in the output log of train_lm.py.
+# The example metaparameter values would be for a 4-gram model (with min-counts)
+# trained with train_lm.py.
+# The dev perplexity should be close to that of the non-bypassed model.
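# (Illustrative sketch only, not part of the patch: an uncommented line would
# look roughly like the one below. --bypass-metaparameter-optimization is a
# pocolm train_lm.py option; the numbers, and how many of them there are, are
# placeholders to be replaced with the values printed in your own optimization log.)
#   bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.79,0.05,0.88,0.72,0.42,0.31"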
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using LOB and brown corpus. + if [ ! -f data/local/lob-train-only.txt ]; then + cat data/local/lobcorpus/0167/download/LOB_COCOA/lob.txt | \ + local/remove_test_utterances_from_lob.py data/test/text.old data/val/text.old \ + > data/local/lob-train-only.txt + fi + cat data/local/lob-train-only.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/lob.txt + cat data/local/browncorpus/brown.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/brown.txt + if [ -d "data/local/wellingtoncorpus" ]; then + cat data/local/wellingtoncorpus/Wellington_annotation_removed.txt | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/data/text/wellington.txt + fi + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + + cat data/val/text | cut -d " " -f 2- > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/iam.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/test/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from IAM text + if [ -d "data/local/wellingtoncorpus" ]; then + cat ${dir}/data/text/{iam,lob,brown,wellington}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + else + echo "$0: Wellington Corpus not found. Proceeding without using that corpus." + cat ${dir}/data/text/{iam,lob,brown}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + fi + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=6 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. 
+ # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='brown=2 lob=2 iam=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. + size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/iam/v2/local/wer_output_filter b/egs/iam/v2/local/wer_output_filter new file mode 100755 index 00000000000..24691a160a9 --- /dev/null +++ b/egs/iam/v2/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/iam/v2/path.sh b/egs/iam/v2/path.sh new file mode 100755 index 00000000000..7e458144624 --- /dev/null +++ b/egs/iam/v2/path.sh @@ -0,0 +1,9 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! 
-f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh + +export LD_LIBRARY_PATH=$KALDI_ROOT/tools/openfst/lib:$LD_LIBRARY_PATH +export LD_LIBRARY_PATH=/home/dpovey/libs:$LD_LIBRARY_PATH +export LC_ALL=C diff --git a/egs/iam/v2/run_end2end.sh b/egs/iam/v2/run_end2end.sh new file mode 100755 index 00000000000..de5c7086ec2 --- /dev/null +++ b/egs/iam/v2/run_end2end.sh @@ -0,0 +1,104 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +set -e +stage=0 +nj=20 +username= +password= +# iam_database points to the database path on the JHU grid. If you have not +# already downloaded the database you can set it to a local directory +# like "data/download" and follow the instructions +# in "local/prepare_data.sh" to download the database: +iam_database=/export/corpora5/handwriting_ocr/IAM +# wellington_database points to the database path on the JHU grid. The Wellington +# corpus contains two directories WWC and WSC (Wellington Written and Spoken Corpus). +# This corpus is of written NZ English that can be purchased here: +# "https://www.victoria.ac.nz/lals/resources/corpora-default" +wellington_database=/export/corpora5/Wellington/WWC/ + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." + local/prepare_data.sh --download-dir "$iam_database" \ + --wellington-dir "$wellington_database" \ + --username "$username" --password "$password" +fi +mkdir -p data/{train,test}/data + +if [ $stage -le 1 ]; then + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$0: Preparing the test and train feature files..." + for dataset in train test; do + local/make_features.py data/$dataset --feat-dim 40 | \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:data/$dataset/data/images.ark,data/$dataset/feats.scp + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + cut -d' ' -f2- data/train/text | \ + local/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + local/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. 
+ # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." + local/chain/run_cnn_e2eali.sh +fi diff --git a/egs/iam/v2/steps b/egs/iam/v2/steps new file mode 120000 index 00000000000..6e99bf5b5ad --- /dev/null +++ b/egs/iam/v2/steps @@ -0,0 +1 @@ +../../wsj/s5/steps \ No newline at end of file diff --git a/egs/iam/v2/utils b/egs/iam/v2/utils new file mode 120000 index 00000000000..b240885218f --- /dev/null +++ b/egs/iam/v2/utils @@ -0,0 +1 @@ +../../wsj/s5/utils \ No newline at end of file diff --git a/egs/librispeech/s5/local/download_and_untar.sh b/egs/librispeech/s5/local/download_and_untar.sh index d01e681fed7..1bb6d909edc 100755 --- a/egs/librispeech/s5/local/download_and_untar.sh +++ b/egs/librispeech/s5/local/download_and_untar.sh @@ -67,7 +67,9 @@ if [ -f $data/$part.tar.gz ]; then fi fi -if [ ! -f $data/$part.tar.gz ]; then +pushd $data + +if [ ! -f $part.tar.gz ]; then if ! which wget >/dev/null; then echo "$0: wget is not installed." exit 1; @@ -75,20 +77,19 @@ if [ ! -f $data/$part.tar.gz ]; then full_url=$url/$part.tar.gz echo "$0: downloading data from $full_url. This may take some time, please be patient." - cd $data if ! wget --no-check-certificate $full_url; then echo "$0: error executing wget $full_url" exit 1; fi fi -cd $data - if ! tar -xvzf $part.tar.gz; then echo "$0: error un-tarring archive $data/$part.tar.gz" exit 1; fi +popd >&/dev/null + touch $data/LibriSpeech/$part/.complete echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz" diff --git a/egs/madcat_ar/v1/run_end2end.sh b/egs/madcat_ar/v1/run_end2end.sh index db9e78a2eac..5d27476d3e1 100755 --- a/egs/madcat_ar/v1/run_end2end.sh +++ b/egs/madcat_ar/v1/run_end2end.sh @@ -116,7 +116,8 @@ if [ $stage -le 8 ]; then echo "$0: Aligning the training data using the e2e chain model..." echo "Date: $(date)." steps/nnet3/align.sh --nj $nj --cmd "$cmd" \ - --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train fi diff --git a/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh new file mode 100755 index 00000000000..0b86ace2de1 --- /dev/null +++ b/egs/mini_librispeech/s5/local/chain/tuning/run_cnn_tdnn_1a.sh @@ -0,0 +1,307 @@ +#!/bin/bash + +# run_cnn_tdnn_1a.sh is modified from run_tdnn_1h.sh, but adding CNN layers +# near the beginning. 
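Like the other tuning scripts in this recipe, the options declared below (stage, affix, train_set, gmm, and so on) are picked up from the command line via utils/parse_options.sh, so the script can be rerun under non-default settings. A minimal sketch of such a call; the affix value is illustrative and not taken from this patch:

  # run from egs/mini_librispeech/s5; the values after the flags are placeholders
  local/chain/tuning/run_cnn_tdnn_1a.sh --stage 0 --affix 1a_retry \
    --train-set train_clean_5 --gmm tri3b

The WER comparison against the plain TDNN baseline follows.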
+ +# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/cnn_tdnn1a_sp +# System tdnn1h_sp cnn_tdnn1a_sp +#WER dev_clean_2 (tgsmall) 12.09 11.15 +# [online:] 12.11 11.17 +#WER dev_clean_2 (tglarge) 8.59 7.79 +# [online:] 8.76 7.80 +# Final train prob -0.0493 -0.0467 +# Final valid prob -0.0805 -0.0789 +# Final train prob (xent) -1.1730 -1.0767 +# Final valid prob (xent) -1.3872 -1.3070 +# Num-params 5207856 4492816 + +# Set -e here so that we catch if any executable fails immediately +set -euo pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +decode_nj=10 +train_set=train_clean_5 +test_sets=dev_clean_2 +gmm=tri3b +nnet3_affix= + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +affix=1a # affix for the TDNN directory name +tree_affix= +train_stage=-10 +get_egs_stage=-10 +decode_iter= + +# training options +# training chunk-options +chunk_width=140,100,160 +dropout_schedule='0,0@0.20,0.3@0.50,0' +common_egs_dir= +xent_regularize=0.1 + +# training options +srand=0 +remove_egs=true +reporting_email= + +#decode options +test_online_decoding=true # if true, it will run the last decoding stage. + + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 11 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ + data/lang $gmm_dir $lat_dir + rm $lat_dir/fsts.*.gz # save space +fi + +if [ $stage -le 12 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." + exit 1; + fi + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 13 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + + cnn_opts="l2-regularize=0.03" + ivector_affine_opts="l2-regularize=0.03" + tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" + tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" + tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" + linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" + prefinal_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.015" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=100 name=ivector + input dim=40 name=input + + # this takes the MFCCs and generates filterbank coefficients. The MFCCs + # are more compressible so we prefer to dump the MFCCs to disk rather + # than filterbanks. 
+ idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat + + linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) + batchnorm-component name=ivector-batchnorm target-rms=0.025 + + batchnorm-component name=idct-batchnorm input=idct + combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 + + conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 + conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 + conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 + conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 + + # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the + # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to + # limit the num-parameters). + tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 + tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 + linear-component name=prefinal-l dim=192 $linear_opts + + ## adding the layers for chain branch + prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output include-log-softmax=false dim=$num_targets $output_opts + + # adding the layers for xent branch + prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 14 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$decode_cmd" \ + --feat.online-ivector-dir=$train_ivector_dir \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.0 \ + --chain.apply-deriv-weights=false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.add-option="--optimization.memory-compression-level=2" \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=20 \ + --trainer.frames-per-iter=3000000 \ + --trainer.optimization.num-jobs-initial=2 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.optimization.initial-effective-lrate=0.002 \ + --trainer.optimization.final-effective-lrate=0.0002 \ + --trainer.num-chunk-per-minibatch=128,64 \ + --egs.chunk-width=$chunk_width \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 15 ]; then + # Note: it's not important to give mkgraph.sh the lang directory with the + # matched topology (since it gets the topology file from the model). + utils/mkgraph.sh \ + --self-loop-scale 1.0 data/lang_test_tgsmall \ + $tree_dir $tree_dir/graph_tgsmall || exit 1; +fi + +if [ $stage -le 16 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + rm $dir/.error 2>/dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l /dev/null || true + + for data in $test_sets; do + ( + nspk=$(wc -l 22. Better, on average. -# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a23_sp exp/chain/cnn_tdnn1a23b_sp exp/chain/cnn_tdnn1a24_sp exp/chain/cnn_tdnn1a24b_sp -# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a23_sp cnn_tdnn1a23b_sp cnn_tdnn1a24_sp cnn_tdnn1a24b_sp -#WER dev_clean_2 (tgsmall) 13.18 13.04 12.15 12.11 11.95 11.86 -# [online:] 13.03 12.97 12.18 12.07 11.99 11.96 -#WER dev_clean_2 (tglarge) 9.18 9.16 8.57 8.47 8.57 8.54 -# [online:] 9.29 9.24 8.64 8.50 8.63 8.57 -# Final train prob -0.0531 -0.0590 -0.0456 -0.0462 -0.0461 -0.0455 -# Final valid prob -0.0844 -0.0865 -0.0800 -0.0802 -0.0800 -0.0798 -# Final train prob (xent) -1.5244 -1.7771 -1.0691 -1.0683 -1.0776 -1.0781 -# Final valid prob (xent) -1.7447 -1.9611 -1.3190 -1.3108 -1.3131 -1.3190 -# Num-params 3512112 3512112 4474688 4474688 4474688 4474688 - -# 1a23 is as 1a14 but for the last cnn layer (cnn5), using twice the num-filters -# plus subsampling on the output. -# A bit better, on average! 
-# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a14_sp exp/chain/cnn_tdnn1a14b_sp exp/chain/cnn_tdnn1a23_sp exp/chain/cnn_tdnn1a23b_sp -# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a14_sp cnn_tdnn1a14b_sp cnn_tdnn1a23_sp cnn_tdnn1a23b_sp -#WER dev_clean_2 (tgsmall) 13.18 13.04 12.14 12.39 12.15 12.11 -# [online:] 13.03 12.97 12.10 12.38 12.18 12.07 -#WER dev_clean_2 (tglarge) 9.18 9.16 8.44 8.69 8.57 8.47 -# [online:] 9.29 9.24 8.58 8.81 8.64 8.50 -# Final train prob -0.0531 -0.0590 -0.0455 -0.0460 -0.0456 -0.0462 -# Final valid prob -0.0844 -0.0865 -0.0806 -0.0802 -0.0800 -0.0802 -# Final train prob (xent) -1.5244 -1.7771 -1.0792 -1.0763 -1.0691 -1.0683 -# Final valid prob (xent) -1.7447 -1.9611 -1.3221 -1.3173 -1.3190 -1.3108 -# Num-params 3512112 3512112 4456224 4456224 4474688 4474688 - -# 1a14 is as 1a13 but with an extra tdnn-f layer. Better! -# local/chain/compare_wer.sh --online exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp exp/chain/cnn_tdnn1a13_sp exp/chain/cnn_tdnn1a14_sp -# System tdnn1h_sp tdnn1h2_sp cnn_tdnn1a13_sp cnn_tdnn1a14_sp -#WER dev_clean_2 (tgsmall) 13.18 13.04 12.21 12.14 -# [online:] 13.03 12.97 12.26 12.10 -#WER dev_clean_2 (tglarge) 9.18 9.16 8.65 8.44 -# [online:] 9.29 9.24 8.67 8.58 -# Final train prob -0.0531 -0.0590 -0.0459 -0.0455 -# Final valid prob -0.0844 -0.0865 -0.0810 -0.0806 -# Final train prob (xent) -1.5244 -1.7771 -1.0901 -1.0792 -# Final valid prob (xent) -1.7447 -1.9611 -1.3328 -1.3221 -# Num-params 3512112 3512112 4160544 4456224 - -# 1a13 is as 1a12 but using the same l2 values for the first layers as for the -# later ones (more l2). -# 1a12 is as 1a11 but making the first TDNN-F layer non-splicing and restoring -# the 640's to 768's. -# 1a11 is as 1a10 but adding some l2 to the CNN layers and to the TDNN layers -# for the ivector training. -# run_cnn_tdnn_1a10.sh is as run_cnn_tdnn_1a.sh but reducing the 768's to 640 -# to make the num-params similar to the tdnn1h experiment (run_cnn_tdnn_1a.sh was overfitting -# a bit). -# -# run_cnn_tdnn_1a.sh is modified from run_tdnn_1h.sh, but adding CNN layers -# near the beginning. - -# 1h is as 1g but a re-tuned model based on resnet-style TDNN-F layers with -# bypass connections. Below, 1h2 is just a rerun of 1h with a different --affix -# option, to give some idea of the run-to-run variation. 
- -# local/chain/compare_wer.sh --online exp/chain/tdnn1g_sp exp/chain/tdnn1h_sp exp/chain/tdnn1h2_sp -# System tdnn1g_sp tdnn1h_sp tdnn1h2_sp -#WER dev_clean_2 (tgsmall) 13.50 13.18 13.04 -# [online:] 13.52 13.03 12.97 -#WER dev_clean_2 (tglarge) 9.79 9.18 9.16 -# [online:] 9.79 9.29 9.24 -# Final train prob -0.0460 -0.0531 -0.0590 -# Final valid prob -0.0892 -0.0844 -0.0865 -# Final train prob (xent) -1.1739 -1.5244 -1.7771 -# Final valid prob (xent) -1.4487 -1.7447 -1.9611 -# Num-params 6234672 3512112 3512112 - -# steps/info/chain_dir_info.pl exp/chain/tdnn1{g,h,h2}_sp -# exp/chain/tdnn1g_sp: num-iters=25 nj=2..5 num-params=6.2M dim=40+100->2328 combine=-0.056->-0.055 (over 3) xent:train/valid[15,24,final]=(-1.50,-1.23,-1.17/-1.73,-1.52,-1.45) logprob:train/valid[15,24,final]=(-0.063,-0.051,-0.046/-0.101,-0.094,-0.089) -# exp/chain/tdnn1h_sp: num-iters=34 nj=2..5 num-params=3.5M dim=40+100->2328 combine=-0.055->-0.050 (over 4) xent:train/valid[21,33,final]=(-1.97,-1.57,-1.52/-2.11,-1.78,-1.74) logprob:train/valid[21,33,final]=(-0.080,-0.061,-0.053/-0.106,-0.096,-0.084) -# exp/chain/tdnn1h2_sp: num-iters=34 nj=2..5 num-params=3.5M dim=40+100->2328 combine=-0.062->-0.056 (over 4) xent:train/valid[21,33,final]=(-2.21,-1.78,-1.78/-2.34,-1.96,-1.96) logprob:train/valid[21,33,final]=(-0.086,-0.066,-0.059/-0.110,-0.098,-0.087) - -# Set -e here so that we catch if any executable fails immediately -set -euo pipefail - -# First the options that are passed through to run_ivector_common.sh -# (some of which are also used in this script directly). -stage=0 -decode_nj=10 -train_set=train_clean_5 -test_sets=dev_clean_2 -gmm=tri3b -nnet3_affix= - -# The rest are configs specific to this script. Most of the parameters -# are just hardcoded at this level, in the commands below. -affix=1a54 # affix for the TDNN directory name -tree_affix= -train_stage=-10 -get_egs_stage=-10 -decode_iter= - -# training options -# training chunk-options -chunk_width=140,100,160 -dropout_schedule='0,0@0.20,0.3@0.50,0' -common_egs_dir= -xent_regularize=0.1 - -# training options -srand=0 -remove_egs=true -reporting_email= - -#decode options -test_online_decoding=true # if true, it will run the last decoding stage. - - -# End configuration section. -echo "$0 $@" # Print the command line for logging - -. ./cmd.sh -. ./path.sh -. ./utils/parse_options.sh - -if ! cuda-compiled; then - cat <$lang/topo - fi -fi - -if [ $stage -le 11 ]; then - # Get the alignments as lattices (gives the chain training more freedom). - # use the same num-jobs as the alignments - steps/align_fmllr_lats.sh --nj 75 --cmd "$train_cmd" ${lores_train_data_dir} \ - data/lang $gmm_dir $lat_dir - rm $lat_dir/fsts.*.gz # save space -fi - -if [ $stage -le 12 ]; then - # Build a tree using our new topology. We know we have alignments for the - # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use - # those. The num-leaves is always somewhat less than the num-leaves from - # the GMM baseline. - if [ -f $tree_dir/final.mdl ]; then - echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
- exit 1; - fi - steps/nnet3/chain/build_tree.sh \ - --frame-subsampling-factor 3 \ - --context-opts "--context-width=2 --central-position=1" \ - --cmd "$train_cmd" 3500 ${lores_train_data_dir} \ - $lang $ali_dir $tree_dir -fi - - -if [ $stage -le 13 ]; then - mkdir -p $dir - echo "$0: creating neural net configs using the xconfig parser"; - - num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') - learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) - - cnn_opts="l2-regularize=0.03" - ivector_affine_opts="l2-regularize=0.03" - tdnn_opts="l2-regularize=0.03 dropout-proportion=0.0 dropout-per-dim-continuous=true" - tdnnf_first_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.0" - tdnnf_opts="l2-regularize=0.03 dropout-proportion=0.0 bypass-scale=0.66" - linear_opts="l2-regularize=0.03 orthonormal-constraint=-1.0" - prefinal_opts="l2-regularize=0.03" - output_opts="l2-regularize=0.015" - - mkdir -p $dir/configs - cat < $dir/configs/network.xconfig - input dim=100 name=ivector - input dim=40 name=input - - # this takes the MFCCs and generates filterbank coefficients. The MFCCs - # are more compressible so we prefer to dump the MFCCs to disk rather - # than filterbanks. - idct-layer name=idct input=input dim=40 cepstral-lifter=22 affine-transform-file=$dir/configs/idct.mat - - linear-component name=ivector-linear $ivector_affine_opts dim=200 input=ReplaceIndex(ivector, t, 0) - batchnorm-component name=ivector-batchnorm target-rms=0.025 - - batchnorm-component name=idct-batchnorm input=idct - combine-feature-maps-layer name=combine_inputs input=Append(idct-batchnorm, ivector-batchnorm) num-filters1=1 num-filters2=5 height=40 - - conv-relu-batchnorm-layer name=cnn1 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 learning-rate-factor=0.333 max-change=0.25 - conv-relu-batchnorm-layer name=cnn2 $cnn_opts height-in=40 height-out=40 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=48 - conv-relu-batchnorm-layer name=cnn3 $cnn_opts height-in=40 height-out=20 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 - conv-relu-batchnorm-layer name=cnn4 $cnn_opts height-in=20 height-out=20 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 - conv-relu-batchnorm-layer name=cnn5 $cnn_opts height-in=20 height-out=10 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=64 - conv-relu-batchnorm-layer name=cnn6 $cnn_opts height-in=10 height-out=5 height-subsample-out=2 time-offsets=-1,0,1 height-offsets=-1,0,1 num-filters-out=128 - - # the first TDNN-F layer has no bypass (since dims don't match), and a larger bottleneck so the - # information bottleneck doesn't become a problem. (we use time-stride=0 so no splicing, to - # limit the num-parameters). 
- tdnnf-layer name=tdnnf7 $tdnnf_first_opts dim=768 bottleneck-dim=192 time-stride=0 - tdnnf-layer name=tdnnf8 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf9 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf10 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf11 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf12 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf13 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf14 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - tdnnf-layer name=tdnnf15 $tdnnf_opts dim=768 bottleneck-dim=96 time-stride=3 - linear-component name=prefinal-l dim=192 $linear_opts - - ## adding the layers for chain branch - prefinal-layer name=prefinal-chain input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 - output-layer name=output include-log-softmax=false dim=$num_targets $output_opts - - # adding the layers for xent branch - prefinal-layer name=prefinal-xent input=prefinal-l $prefinal_opts small-dim=192 big-dim=768 - output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor $output_opts -EOF - steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ -fi - - -if [ $stage -le 14 ]; then - if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! -d $dir/egs/storage ]; then - utils/create_split_dir.pl \ - /export/fs0{1,2}/$USER/kaldi-data/egs/mini_librispeech-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage - fi - - steps/nnet3/chain/train.py --stage=$train_stage \ - --cmd="$decode_cmd" \ - --feat.online-ivector-dir=$train_ivector_dir \ - --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ - --chain.xent-regularize $xent_regularize \ - --chain.leaky-hmm-coefficient=0.1 \ - --chain.l2-regularize=0.0 \ - --chain.apply-deriv-weights=false \ - --chain.lm-opts="--num-extra-lm-states=2000" \ - --trainer.dropout-schedule $dropout_schedule \ - --trainer.add-option="--optimization.memory-compression-level=2" \ - --trainer.srand=$srand \ - --trainer.max-param-change=2.0 \ - --trainer.num-epochs=20 \ - --trainer.frames-per-iter=3000000 \ - --trainer.optimization.num-jobs-initial=2 \ - --trainer.optimization.num-jobs-final=5 \ - --trainer.optimization.initial-effective-lrate=0.002 \ - --trainer.optimization.final-effective-lrate=0.0002 \ - --trainer.num-chunk-per-minibatch=128,64 \ - --egs.chunk-width=$chunk_width \ - --egs.dir="$common_egs_dir" \ - --egs.opts="--frames-overlap-per-eg 0" \ - --cleanup.remove-egs=$remove_egs \ - --use-gpu=true \ - --reporting.email="$reporting_email" \ - --feat-dir=$train_data_dir \ - --tree-dir=$tree_dir \ - --lat-dir=$lat_dir \ - --dir=$dir || exit 1; -fi - -if [ $stage -le 15 ]; then - # Note: it's not important to give mkgraph.sh the lang directory with the - # matched topology (since it gets the topology file from the model). 
- utils/mkgraph.sh \ - --self-loop-scale 1.0 data/lang_test_tgsmall \ - $tree_dir $tree_dir/graph_tgsmall || exit 1; -fi - -if [ $stage -le 16 ]; then - frames_per_chunk=$(echo $chunk_width | cut -d, -f1) - rm $dir/.error 2>/dev/null || true - - for data in $test_sets; do - ( - nspk=$(wc -l /dev/null || true - - for data in $test_sets; do - ( - nspk=$(wc -l 2854 combine=-0.042->-0.042 (over 2) xent:train/valid[71,107,final]=(-0.975,-0.640,-0.646/-0.980,-0.678,-0.688) logprob:train/valid[71,107,final]=(-0.067,-0.043,-0.042/-0.069,-0.050,-0.049) -# exp/chain/cnn_tdnn1b17_sp: num-iters=144 nj=2..8 num-params=6.9M dim=40+100->2854 combine=-0.041->-0.041 (over 3) xent:train/valid[95,143,final]=(-0.866,-0.617,-0.620/-0.881,-0.657,-0.659) logprob:train/valid[95,143,final]=(-0.061,-0.042,-0.041/-0.062,-0.050,-0.049) - -# The following table compares chain (TDNN+LSTM, TDNN, CNN+TDNN). -# The CNN+TDNN doesn't seem to have any advantages versus the TDNN (and it's -# about 5 times slower per iteration). But it's not well tuned. -# And the num-params is fewer (5.5M vs 7.6M for TDNN). - -# local/chain/compare_wer.sh exp/chain/tdnn_lstm1a_sp exp/chain/tdnn1a_sp exp/chain/cnn_tdnn1a_sp -# System tdnn_lstm1a_sp tdnn1a_sp cnn_tdnn1a_sp -#WER dev93 (tgpr) 7.48 7.87 9.02 -#WER dev93 (tg) 7.41 7.61 8.60 -#WER dev93 (big-dict,tgpr) 5.64 5.71 6.97 -#WER dev93 (big-dict,fg) 5.40 5.10 6.12 -#WER eval92 (tgpr) 5.67 5.23 5.56 -#WER eval92 (tg) 5.46 4.87 5.05 -#WER eval92 (big-dict,tgpr) 3.69 3.24 3.40 -#WER eval92 (big-dict,fg) 3.28 2.71 2.73 -# Final train prob -0.0341 -0.0414 -0.0532 -# Final valid prob -0.0506 -0.0634 -0.0752 -# Final train prob (xent) -0.5643 -0.8216 -1.0857 -# Final valid prob (xent) -0.6648 -0.9208 -1.1505 - - set -e -o pipefail @@ -128,7 +34,7 @@ num_threads_ubm=32 nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. # Options which are not passed through to run_ivector_common.sh -affix=1b17 #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +affix=1c #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. common_egs_dir= reporting_email= diff --git a/egs/wsj/s5/steps/align_basis_fmllr.sh b/egs/wsj/s5/steps/align_basis_fmllr.sh index 417fbc96c8a..e5510c5ab7e 100755 --- a/egs/wsj/s5/steps/align_basis_fmllr.sh +++ b/egs/wsj/s5/steps/align_basis_fmllr.sh @@ -20,6 +20,7 @@ cmd=run.pl use_graphs=false # Begin configuration. scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" +basis_fmllr_opts="--fmllr-min-count=22 --num-iters=10 --size-scale=0.2 --step-size-iters=3" beam=10 retry_beam=40 boost_silence=1.5 # factor by which to boost silence during alignment. 
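Collecting the previously hard-coded gmm-est-basis-fmllr-gpost flags into a single basis_fmllr_opts variable also makes them overridable per invocation, assuming the usual utils/parse_options.sh handling used by the steps/ scripts (a variable in the configuration section becomes a --basis-fmllr-opts option). A minimal sketch of such an override; the data, lang and model directories are placeholders, not paths from this patch:

  # raise the per-speaker min-count for basis-fMLLR estimation; the other values keep the defaults above
  steps/align_basis_fmllr.sh --nj 8 --cmd "$train_cmd" \
    --basis-fmllr-opts "--fmllr-min-count=50 --num-iters=10 --size-scale=0.2 --step-size-iters=3" \
    data/train data/lang exp/tri3b exp/tri3b_ali_basis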
@@ -136,22 +137,20 @@ if [ $stage -le 2 ]; then ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ - gmm-est-basis-fmllr-gpost --fmllr-min-count=22 --num-iters=10 \ - --size-scale=0.2 --step-size-iters=3 \ - --write-weights=ark:$dir/pre_wgt.JOB \ + gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \ $mdl $srcdir/fmllr.basis "$sifeats" ark,s,cs:- \ ark:$dir/trans.JOB || exit 1; -# else -# $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ -# ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ -# weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ -# gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \ -# --spk2utt=ark:$sdata/JOB/spk2utt $mdl "$sifeats" \ -# ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-basis-fmllr $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \ + $mdl $srcdir/fmllr.basis "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; fi fi -feats="$sifeats transform-feats ark:$dir/trans.JOB ark:- ark:- |" +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" if [ $stage -le 3 ]; then echo "$0: doing final alignment." diff --git a/egs/wsj/s5/steps/align_basis_fmllr_lats.sh b/egs/wsj/s5/steps/align_basis_fmllr_lats.sh new file mode 100755 index 00000000000..426168496cc --- /dev/null +++ b/egs/wsj/s5/steps/align_basis_fmllr_lats.sh @@ -0,0 +1,184 @@ +#!/bin/bash +# +# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey) +# Apache 2.0 + +# Version of align_fmllr_lats.sh that uses "basis fMLLR", so it is suitable for +# situations where there is very little data per speaker (e.g. when there is a +# one-to-one mapping between utterances and speakers). Intended for use where +# the model was trained with basis-fMLLR (i.e. when you trained the model with +# train_sat_basis.sh where you normally would have trained with train_sat.sh), +# or when it was trained with SAT but you ran get_fmllr_basis.sh on the +# source-model directory. + +# Begin configuration section. +stage=0 +nj=4 +cmd=run.pl +# Begin configuration. +scale_opts="--transition-scale=1.0 --self-loop-scale=0.1" +acoustic_scale=0.1 +beam=10 +retry_beam=40 +final_beam=20 # For the lattice-generation phase there is no retry-beam. This + # is a limitation of gmm-latgen-faster. We just use an + # intermediate beam. We'll lose a little data and it will be + # slightly slower. (however, the min-active of 200 that + # gmm-latgen-faster defaults to may help.) +boost_silence=1.0 # factor by which to boost silence during alignment. +basis_fmllr_opts="--fmllr-min-count=22 --num-iters=10 --size-scale=0.2 --step-size-iters=3" + +generate_ali_from_lats=false # If true, alingments generated from lattices. +# End configuration options. + +echo "$0 $@" # Print the command line for logging + +[ -f path.sh ] && . ./path.sh # source the path. +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "usage: steps/align_fmllr_lats.sh " + echo "e.g.: steps/align_fmllr_lats.sh data/train data/lang exp/tri1 exp/tri1_lats" + echo "main options (for others, see top of script file)" + echo " --config # config containing options" + echo " --nj # number of parallel jobs" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." 
+ exit 1; +fi + +data=$1 +lang=$2 +srcdir=$3 +dir=$4 + +if [ ! -f $srcdir/fmllr.basis ]; then + echo "$0: expected $srcdir/fmllr.basis to exist. Run get_fmllr_basis.sh on $srcdir." +fi + +for f in $data/feats.scp $lang/phones.txt $srcdir/final.mdl; do + [ ! -f $f ] && echo "$0: expected file $f to exist" && exit 1 +done + + +oov=`cat $lang/oov.int` || exit 1; +silphonelist=`cat $lang/phones/silence.csl` || exit 1; +sdata=$data/split$nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; + +utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt || exit 1; +cp $lang/phones.txt $dir || exit 1; + +cp $srcdir/{tree,final.mdl} $dir || exit 1; +cp $srcdir/final.alimdl $dir 2>/dev/null +cp $srcdir/final.occs $dir; +splice_opts=`cat $srcdir/splice_opts 2>/dev/null` # frame-splicing options. +cp $srcdir/splice_opts $dir 2>/dev/null # frame-splicing options. +cmvn_opts=`cat $srcdir/cmvn_opts 2>/dev/null` +cp $srcdir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option. +delta_opts=`cat $srcdir/delta_opts 2>/dev/null` +cp $srcdir/delta_opts $dir 2>/dev/null + +if [ -f $srcdir/final.mat ]; then feat_type=lda; else feat_type=delta; fi +echo "$0: feature type is $feat_type" + +case $feat_type in + delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";; + lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" + cp $srcdir/final.mat $dir + cp $srcdir/full.mat $dir 2>/dev/null + ;; + *) echo "Invalid feature type $feat_type" && exit 1; +esac + +## Set up model and alignment model. +mdl=$srcdir/final.mdl +if [ -f $srcdir/final.alimdl ]; then + alimdl=$srcdir/final.alimdl +else + alimdl=$srcdir/final.mdl +fi +[ ! -f $mdl ] && echo "$0: no such model $mdl" && exit 1; +alimdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $alimdl - |" +mdl_cmd="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $mdl - |" + + +## because gmm-latgen-faster doesn't support adding the transition-probs to the +## graph itself, we need to bake them into the compiled graphs. This means we can't reuse previously compiled graphs, +## because the other scripts write them without transition probs. +if [ $stage -le 0 ]; then + echo "$0: compiling training graphs" + tra="ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt $sdata/JOB/text|"; + $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \ + compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $scale_opts $dir/tree $dir/final.mdl $lang/L.fst "$tra" \ + "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1; +fi + + +if [ $stage -le 1 ]; then + # Note: we need to set --transition-scale=0.0 --self-loop-scale=0.0 because, + # as explained above, we compiled the transition probs into the training + # graphs. + echo "$0: aligning data in $data using $alimdl and speaker-independent features." 
+ $cmd JOB=1:$nj $dir/log/align_pass1.JOB.log \ + gmm-align-compiled --transition-scale=0.0 --self-loop-scale=0.0 --acoustic-scale=$acoustic_scale \ + --beam=$beam --retry-beam=$retry_beam "$alimdl_cmd" \ + "ark:gunzip -c $dir/fsts.JOB.gz|" "$sifeats" "ark:|gzip -c >$dir/pre_ali.JOB.gz" || exit 1; +fi + +if [ $stage -le 2 ]; then + echo "$0: computing fMLLR transforms" + if [ "$alimdl" != "$mdl" ]; then + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-post-to-gpost $alimdl "$sifeats" ark:- ark:- \| \ + gmm-est-basis-fmllr-gpost $basis_fmllr_opts \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl $srcdir/fmllr.basis "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + else + $cmd JOB=1:$nj $dir/log/fmllr.JOB.log \ + ali-to-post "ark:gunzip -c $dir/pre_ali.JOB.gz|" ark:- \| \ + weight-silence-post 0.0 $silphonelist $alimdl ark:- ark:- \| \ + gmm-est-basis-fmllr $basis_fmllr_opts \ + --spk2utt=ark:$sdata/JOB/spk2utt $mdl $srcdir/fmllr.basis "$sifeats" \ + ark,s,cs:- ark:$dir/trans.JOB || exit 1; + fi +fi + +feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" + +if [ $stage -le 3 ]; then + # Warning: gmm-latgen-faster doesn't support a retry-beam so you may get more + # alignment errors (however, it does have a default min-active=200 so this + # will tend to reduce alignment errors). + # --allow_partial=false makes sure we reach the end of the decoding graph. + # --word-determinize=false makes sure we retain the alternative pronunciations of + # words (including alternatives regarding optional silences). + # --lattice-beam=$beam keeps all the alternatives that were within the beam, + # it means we do no pruning of the lattice (lattices from a training transcription + # will be small anyway). + echo "$0: generating lattices containing alternate pronunciations." + $cmd JOB=1:$nj $dir/log/generate_lattices.JOB.log \ + gmm-latgen-faster --acoustic-scale=$acoustic_scale --beam=$final_beam \ + --lattice-beam=$final_beam --allow-partial=false --word-determinize=false \ + "$mdl_cmd" "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \ + "ark:|gzip -c >$dir/lat.JOB.gz" || exit 1; +fi + +if [ $stage -le 4 ] && $generate_ali_from_lats; then + # If generate_alignments is true, ali.*.gz is generated in lats dir + $cmd JOB=1:$nj $dir/log/generate_alignments.JOB.log \ + lattice-best-path --acoustic-scale=$acoustic_scale "ark:gunzip -c $dir/lat.JOB.gz |" \ + ark:/dev/null "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1; +fi + +rm $dir/pre_ali.*.gz 2>/dev/null || true + +echo "$0: done generating lattices from training transcripts." + +utils/summarize_warnings.pl $dir/log + +exit 0; diff --git a/egs/wsj/s5/steps/align_fmllr_lats.sh b/egs/wsj/s5/steps/align_fmllr_lats.sh index 187d9bf5687..b47b97ef994 100755 --- a/egs/wsj/s5/steps/align_fmllr_lats.sh +++ b/egs/wsj/s5/steps/align_fmllr_lats.sh @@ -5,7 +5,7 @@ # Version of align_fmllr.sh that generates lattices (lat.*.gz) with # alignments of alternative pronunciations in them. Mainly intended -# as a precursor to CTC training for now. +# as a precursor to LF-MMI/chain training for now. # Begin configuration section. 
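For reference, the new steps/align_basis_fmllr_lats.sh introduced above keeps the same four-argument calling convention as align_fmllr_lats.sh; a minimal sketch of a typical call, with placeholder directory names rather than anything taken from this patch:

  # the source directory must already contain fmllr.basis (see steps/get_fmllr_basis.sh)
  steps/align_basis_fmllr_lats.sh --nj 20 --cmd "$train_cmd" \
    --generate-ali-from-lats true \
    data/train data/lang exp/tri3b exp/tri3b_lats_basis

Setting --generate-ali-from-lats true additionally writes ali.*.gz into the lattice directory, as handled by stage 4 of the script.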
stage=0 diff --git a/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh b/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh index de99fd8e624..d1297ccd836 100755 --- a/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh +++ b/egs/wsj/s5/steps/cleanup/decode_fmllr_segmentation.sh @@ -111,7 +111,7 @@ delta_opts=`cat $srcdir/delta_opts 2>/dev/null` || true silphonelist=`cat $graphdir/phones/silence.csl` || exit 1; -utils/lang/check_phones_compatible.sh $graph_dir/phones.txt $srcdir/phones.txt +utils/lang/check_phones_compatible.sh $graphdir/phones.txt $srcdir/phones.txt # Some checks. Note: we don't need $srcdir/tree but we expect # it should exist, given the current structure of the scripts. diff --git a/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh b/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh index b641cd18cbb..ff0a87ae295 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh +++ b/egs/wsj/s5/steps/diagnostic/analyze_alignments.sh @@ -44,7 +44,7 @@ $cmd JOB=1:$num_jobs $dir/log/get_phone_alignments.JOB.log \ set -o pipefail '&&' ali-to-phones --write-lengths=true "$model" \ "ark:gunzip -c $dir/ali.JOB.gz|" ark,t:- \| \ sed -E 's/^[^ ]+ //' \| \ - awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ + awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; if (NF>1) print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ sort \| uniq -c \| gzip -c '>' $dir/phone_stats.JOB.gz || exit 1 if ! $cmd $dir/log/analyze_alignments.log \ diff --git a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh index 98b33d9d09d..d580f516527 100755 --- a/egs/wsj/s5/steps/diagnostic/analyze_lats.sh +++ b/egs/wsj/s5/steps/diagnostic/analyze_lats.sh @@ -51,7 +51,7 @@ $cmd JOB=1:$num_jobs $dir/log/lattice_best_path.JOB.log \ $cmd JOB=1:$num_jobs $dir/log/get_lattice_stats.JOB.log \ ali-to-phones --write-lengths=true "$model" "ark:gunzip -c $dir/ali_tmp.JOB.gz|" ark,t:- \| \ sed -E 's/^[^ ]+ //' \| \ - awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ + awk 'BEGIN{FS=" ; "; OFS="\n";} {print "begin " $1; if (NF>1) print "end " $NF; for (n=1;n<=NF;n++) print "all " $n; }' \| \ sort \| uniq -c \| gzip -c '>' $dir/phone_stats.JOB.gz || exit 1 diff --git a/egs/wsj/s5/steps/libs/common.py b/egs/wsj/s5/steps/libs/common.py index 1e8e2ced6ce..503721c23d1 100644 --- a/egs/wsj/s5/steps/libs/common.py +++ b/egs/wsj/s5/steps/libs/common.py @@ -18,6 +18,11 @@ import sys import threading +try: + import thread as thread_module +except: + import _thread as thread_module + logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) @@ -230,8 +235,7 @@ def background_command_waiter(command, popen_object, require_zero_status): logger.error(str) # thread.interrupt_main() sends a KeyboardInterrupt to the main # thread, which will generally terminate the program. 
- import thread - thread.interrupt_main() + thread_module.interrupt_main() else: logger.warning(str) diff --git a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py index 73f4e5b6533..1afc26ff163 100755 --- a/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py +++ b/egs/wsj/s5/steps/libs/nnet3/report/log_parse.py @@ -532,7 +532,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): try: report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2]-x[1])) - except KeyError, IndexError: + except (KeyError, IndexError): continue total_time = 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 229f290e94c..6afb43824fd 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -167,7 +167,7 @@ def train_new_models(dir, iter, srand, num_jobs, # work out the 1-based archive index. archive_index = (k % num_archives) + 1 # previous : frame_shift = (k/num_archives) % frame_subsampling_factor - frame_shift = ((archive_index + k/num_archives) + frame_shift = ((archive_index + k//num_archives) % frame_subsampling_factor) multitask_egs_opts = common_train_lib.get_multitask_egs_opts( diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 720164e5436..d052c78b3f8 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -288,7 +288,7 @@ def halve_range_str(range_str): halved_ranges = [] for r in ranges: # a range may be either e.g. '64', or '128:256' - c = [str(max(1, int(x)/2)) for x in r.split(":")] + c = [str(max(1, int(x)//2)) for x in r.split(":")] halved_ranges.append(":".join(c)) return ','.join(halved_ranges) @@ -591,7 +591,7 @@ def get_model_combine_iters(num_iters, num_epochs, models_to_combine.add(num_iters) else: subsample_model_factor = 1 - num_iters_combine = min(max_models_combine, num_iters/2) + num_iters_combine = min(max_models_combine, num_iters//2) models_to_combine = set(range(num_iters - num_iters_combine + 1, num_iters + 1)) diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index e95de336586..9a856bc6fe1 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -748,7 +748,8 @@ def check_configs(self): if self.config['target-rms'] < 0.0: raise RuntimeError("target-rms has invalid value {0}" .format(self.config['target-rms'])) - if self.config['learning-rate-factor'] <= 0.0: + if (self.config['learning-rate-factor'] != '' and + self.config['learning-rate-factor'] <= 0.0): raise RuntimeError("learning-rate-factor has invalid value {0}" .format(self.config['learning-rate-factor'])) diff --git a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh index 1dbcbe1a192..049e15df303 100755 --- a/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh +++ b/egs/wsj/s5/steps/lmrescore_rnnlm_lat.sh @@ -57,12 +57,9 @@ fi oldlm=$oldlang/G.fst if [ -f $oldlang/G.carpa ]; then oldlm=$oldlang/G.carpa -elif [ ! -f $oldlm ]; then - echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" &&\ - exit 1; fi -[ ! -f $oldlm ] && echo "$0: Missing file $oldlm" && exit 1; +[ ! -f $oldlm ] && echo "$0: expecting either $oldlang/G.fst or $oldlang/G.carpa to exist" && exit 1; [ ! 
-f $rnnlm_dir/rnnlm ] && echo "$0: Missing file $rnnlm_dir/rnnlm" && exit 1; [ ! -f $rnnlm_dir/unk.probs ] &&\ echo "$0: Missing file $rnnlm_dir/unk.probs" && exit 1; diff --git a/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh b/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh index 2fcc4a1944d..dd0eeeddddd 100755 --- a/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh +++ b/egs/wsj/s5/steps/nnet3/decode_score_fusion.sh @@ -76,8 +76,8 @@ write_compact=true # If set to false, then writes the lattice in non-compact f if [ $# -lt 5 ]; then echo "Usage: $0 [options] [ ... ] " - echo "e.g.: local/socal/score_fusion.sh --nj 8 \\" - echo "--online-ivector-dir exp/nnet3/ivectors_test_eval92 \\" + echo "e.g.: steps/nnet3/decode_score_fusion.sh --nj 8 \\" + echo " --online-ivector-dir exp/nnet3/ivectors_test_eval92 \\" echo " data/test_eval92_hires exp/nnet3/tdnn/graph exp/nnet3/tdnn/output exp/nnet3/tdnn1/output .. \\" echo " exp/nnet3/tdnn_comb/decode_dev" echo "main options (for others, see top of script file)" @@ -116,9 +116,6 @@ if [ $frame_subsampling_factor -ne 1 ]; then frame_subsampling_opt="--frame-subsampling-factor=$frame_subsampling_factor" fi -# convert $dir to absolute pathname -fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $dir ${PWD}` - # Possibly use multi-threaded decoder thread_string= [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" diff --git a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh index 2e368283ed4..784693ee44c 100755 --- a/egs/wsj/s5/steps/nnet3/get_egs_targets.sh +++ b/egs/wsj/s5/steps/nnet3/get_egs_targets.sh @@ -130,8 +130,8 @@ if ! [ $num_utts -gt $[$num_utts_subset_valid*4] ]; then fi # Get list of validation utterances. -awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset_valid | sort \ - > $dir/valid_uttlist || exit 1; +awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset_valid | sort \ + > $dir/valid_uttlist if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" @@ -145,7 +145,7 @@ if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. fi awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $dir/valid_uttlist | \ - utils/shuffle_list.pl | head -$num_utts_subset_train | sort > $dir/train_subset_uttlist || exit 1; + utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset_train | sort > $dir/train_subset_uttlist ## Set up features. echo "$0: feature type is raw" diff --git a/egs/wsj/s5/steps/train_sat_basis.sh b/egs/wsj/s5/steps/train_sat_basis.sh index 45384fe4ecd..5245ea0c619 100755 --- a/egs/wsj/s5/steps/train_sat_basis.sh +++ b/egs/wsj/s5/steps/train_sat_basis.sh @@ -17,6 +17,7 @@ scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1" beam=10 retry_beam=40 boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment +basis_fmllr_opts="--fmllr-min-count=22 --num-iters=10 --size-scale=0.2 --step-size-iters=3" context_opts= # e.g. set this to "--context-width 5 --central-position 2" for quinphone. 
realign_iters="10 20 30"; fmllr_iters="2 4 6 12"; @@ -93,7 +94,7 @@ esac ## Get initial fMLLR transforms (possibly from alignment dir) if [ -f $alidir/trans.1 ]; then echo "$0: Using transforms from $alidir" - feats="$sifeats transform-feats ark,s,cs:$alidir/trans.JOB ark:- ark:- |" + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" cur_trans_dir=$alidir else if [ $stage -le -5 ]; then @@ -114,13 +115,11 @@ else ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \ weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \ gmm-post-to-gpost $alidir/final.mdl "$sifeats" ark:- ark:- \| \ - gmm-est-basis-fmllr-gpost --fmllr-min-count=22 --num-iters=10 \ - --size-scale=0.2 --step-size-iters=3 \ - --write-weights=ark:$dir/pre_wgt.JOB \ + gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \ $alidir/final.mdl $alidir/fmllr.basis "$sifeats" ark,s,cs:- \ ark:$alidir/trans.JOB || exit 1; - feats="$sifeats transform-feats ark,s,cs:$alidir/trans.JOB ark:- ark:- |" + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |" cur_trans_dir=$alidir fi fi @@ -214,14 +213,12 @@ while [ $x -lt $num_iters ]; do ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \ weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \ gmm-post-to-gpost $dir/$x.mdl "$sifeats" ark:- ark:- \| \ - gmm-est-basis-fmllr-gpost --fmllr-min-count=22 --num-iters=10 \ - --size-scale=0.2 --step-size-iters=3 \ - --write-weights=ark:$dir/pre_wgt.JOB \ + gmm-est-basis-fmllr-gpost $basis_fmllr_opts --spk2utt=ark:$sdata/JOB/spk2utt \ $dir/$x.mdl $dir/fmllr.basis "$sifeats" ark,s,cs:- \ ark:$dir/trans.JOB || exit 1; fi - feats="$sifeats transform-feats ark:$dir/trans.JOB ark:- ark:- |" + feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |" cur_trans_dir=$dir fi diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh index 103a4173dc0..ca0972ca85b 100755 --- a/egs/wsj/s5/utils/fix_data_dir.sh +++ b/egs/wsj/s5/utils/fix_data_dir.sh @@ -6,6 +6,8 @@ # It puts the original contents of data-dir into # data-dir/.backup +cmd="$@" + utt_extra_files= spk_extra_files= @@ -21,6 +23,12 @@ if [ $# != 1 ]; then fi data=$1 + +if [ -f $data/images.scp ]; then + image/fix_data_dir.sh $cmd + exit $? +fi + mkdir -p $data/.backup [ ! 
-d $data ] && echo "$0: no such directory $data" && exit 1; diff --git a/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py b/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py index 81c0df36d2b..f7e0dcbdc5f 100755 --- a/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py +++ b/egs/wsj/s5/utils/lang/limit_arpa_unk_history.py @@ -77,6 +77,10 @@ def find_and_replace_unks(old_lm_lines, max_ngrams, skip_rows): if "\{}-grams:".format(max_ngrams) in line: last_ngram = True + for i in range(max_ngrams): + if "\{}-grams:".format(i+1) in line: + ngram = i+1 + # remove any n-gram states of the form: foo -> X # that is, any n-grams of order > 2 where # is the second-to-last word @@ -85,7 +89,6 @@ def find_and_replace_unks(old_lm_lines, max_ngrams, skip_rows): if passed_2grams: g_unk = unk_pattern.search(line) if g_unk: - ngram = len(g_unk.group(0).split()) - 1 ngram_diffs[ngram] = ngram_diffs[ngram] - 1 unk_row_count += 1 continue diff --git a/egs/wsj/s5/utils/parallel/retry.pl b/egs/wsj/s5/utils/parallel/retry.pl index a039d6f5a74..618e9fb01bc 100755 --- a/egs/wsj/s5/utils/parallel/retry.pl +++ b/egs/wsj/s5/utils/parallel/retry.pl @@ -94,7 +94,6 @@ sub get_log_file { # Later on we might want to figure out which array jobs failed # and have to be rerun, but for now we just die. print STDERR "$0: job failed and log file $log_file does not exist (array job?).\n"; - exit($return_status) } else { rename($log_file, $log_file . ".bak"); print STDERR "$0: job failed; renaming log file to ${log_file}.bak and rerunning\n"; diff --git a/egs/wsj/s5/utils/subset_data_dir.sh b/egs/wsj/s5/utils/subset_data_dir.sh index ba52d140ccc..93ee0971b88 100755 --- a/egs/wsj/s5/utils/subset_data_dir.sh +++ b/egs/wsj/s5/utils/subset_data_dir.sh @@ -124,8 +124,10 @@ function do_filtering { [ -f $srcdir/reco2file_and_channel ] && \ utils/filter_scp.pl $destdir/reco <$srcdir/reco2file_and_channel >$destdir/reco2file_and_channel - # Filter the STM file for proper sclite scoring (this will also remove the comments lines) - [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm > $destdir/stm + # Filter the STM file for proper sclite scoring + # Copy over the comments from STM file + [ -f $srcdir/stm ] && grep "^;;" $srcdir/stm > $destdir/stm + [ -f $srcdir/stm ] && utils/filter_scp.pl $destdir/reco < $srcdir/stm >> $destdir/stm rm $destdir/reco else diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh index 453ad6935f6..a8b0542c1bb 100755 --- a/egs/wsj/s5/utils/validate_data_dir.sh +++ b/egs/wsj/s5/utils/validate_data_dir.sh @@ -1,5 +1,6 @@ #!/bin/bash +cmd="$@" no_feats=false no_wav=false @@ -44,6 +45,12 @@ if [ ! -d $data ]; then exit 1; fi +if [ -f $data/images.scp ]; then + cmd=${cmd/--no-wav/} # remove --no-wav if supplied + image/validate_data_dir.sh $cmd + exit $? +fi + for f in spk2utt utt2spk; do if [ ! 
-f $data/$f ]; then echo "$0: no such file $f" diff --git a/misc/docker/centos/Dockerfile b/misc/docker/centos/Dockerfile index 304951fa4e0..27fe31c0566 100644 --- a/misc/docker/centos/Dockerfile +++ b/misc/docker/centos/Dockerfile @@ -7,7 +7,7 @@ ENV CPU_CORE 4 RUN yum update -y RUN yum groupinstall -y "C Development Tools and Libraries" "Development Tools" "System Tools" RUN yum install -y \ - git bzip2 wget subversion which \ + git bzip2 wget subversion which sox \ gcc-c++ make automake autoconf zlib-devel atlas-static \ python diff --git a/misc/docker/fedora/Dockerfile b/misc/docker/fedora/Dockerfile index 68f2d9504c7..4e30f8e66bf 100644 --- a/misc/docker/fedora/Dockerfile +++ b/misc/docker/fedora/Dockerfile @@ -7,8 +7,8 @@ ENV CPU_CORE 4 RUN yum update -y RUN yum groupinstall -y "C Development Tools and Libraries" "Development Tools" RUN yum install -y \ - git bzip2 wget subversion \ - gcc-c++ make automake autoconf zlib-devel \ + git bzip2 wget subversion sox \ + gcc-c++ make automake autoconf zlib-devel atlas-static \ python python3 diff --git a/misc/docker/ubuntu-cuda/Dockerfile b/misc/docker/ubuntu-cuda/Dockerfile index f61d4403355..f6225ee12ed 100644 --- a/misc/docker/ubuntu-cuda/Dockerfile +++ b/misc/docker/ubuntu-cuda/Dockerfile @@ -7,7 +7,7 @@ ENV CPU_CORE 4 RUN \ apt-get update -qq && \ apt-get install -y \ - git bzip2 wget \ + git bzip2 wget sox \ g++ make python python3 \ zlib1g-dev automake autoconf libtool subversion \ libatlas-base-dev diff --git a/misc/docker/ubuntu/Dockerfile b/misc/docker/ubuntu/Dockerfile index 6e2bc5def92..a87330b30fc 100644 --- a/misc/docker/ubuntu/Dockerfile +++ b/misc/docker/ubuntu/Dockerfile @@ -7,7 +7,7 @@ ENV CPU_CORE 4 RUN \ apt-get update -qq && \ apt-get install -y \ - git bzip2 wget \ + git bzip2 wget sox \ g++ make python python3 \ zlib1g-dev automake autoconf libtool subversion \ libatlas-base-dev diff --git a/scripts/rnnlm/choose_features.py b/scripts/rnnlm/choose_features.py index 0686c8f88c6..842cafb3c97 100755 --- a/scripts/rnnlm/choose_features.py +++ b/scripts/rnnlm/choose_features.py @@ -11,7 +11,7 @@ sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script chooses the sparse feature representation of words. " "To be more specific, it chooses the set of features-- you compute " diff --git a/scripts/rnnlm/get_special_symbol_opts.py b/scripts/rnnlm/get_special_symbol_opts.py index 83f7d708a49..4310b116ad7 100755 --- a/scripts/rnnlm/get_special_symbol_opts.py +++ b/scripts/rnnlm/get_special_symbol_opts.py @@ -9,7 +9,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script checks whether the special symbols " "appear in words.txt with expected values, if not, it will " diff --git a/scripts/rnnlm/get_unigram_probs.py b/scripts/rnnlm/get_unigram_probs.py index abb8515f330..ab3f9bb382f 100755 --- a/scripts/rnnlm/get_unigram_probs.py +++ b/scripts/rnnlm/get_unigram_probs.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script gets the unigram probabilities of words.", epilog="E.g. 
" + sys.argv[0] + " --vocab-file=data/rnnlm/vocab/words.txt " diff --git a/scripts/rnnlm/get_vocab.py b/scripts/rnnlm/get_vocab.py index e30ce4a94c9..5036db0ed2a 100755 --- a/scripts/rnnlm/get_vocab.py +++ b/scripts/rnnlm/get_vocab.py @@ -9,7 +9,7 @@ sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script get a vocab from unigram counts " "of words produced by get_unigram_counts.sh", diff --git a/scripts/rnnlm/get_word_features.py b/scripts/rnnlm/get_word_features.py index 54d84077060..aeb7a3ec6ae 100755 --- a/scripts/rnnlm/get_word_features.py +++ b/scripts/rnnlm/get_word_features.py @@ -10,7 +10,7 @@ from collections import defaultdict import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script turns the words into the sparse feature representation, " "using features from rnnlm/choose_features.py.", diff --git a/scripts/rnnlm/prepare_rnnlm_dir.sh b/scripts/rnnlm/prepare_rnnlm_dir.sh index 1de91bb7232..e6016701916 100755 --- a/scripts/rnnlm/prepare_rnnlm_dir.sh +++ b/scripts/rnnlm/prepare_rnnlm_dir.sh @@ -23,7 +23,7 @@ if [ $# != 3 ]; then echo "Usage: $0 [options] " echo "Sets up the directory for RNNLM training as done by" echo "rnnlm/train_rnnlm.sh, and initializes the model." - echo " is as validated by rnnlm/validate_data_dir.py" + echo " is as validated by rnnlm/validate_text_dir.py" echo " is as validated by rnnlm/validate_config_dir.sh." exit 1 fi diff --git a/scripts/rnnlm/prepare_split_data.py b/scripts/rnnlm/prepare_split_data.py index e39f4504f37..cceac48313e 100755 --- a/scripts/rnnlm/prepare_split_data.py +++ b/scripts/rnnlm/prepare_split_data.py @@ -9,7 +9,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script prepares files containing integerized text, " "for consumption by nnet3-get-egs.", diff --git a/scripts/rnnlm/show_word_features.py b/scripts/rnnlm/show_word_features.py index 5fe049cb8ce..53d4729b4bb 100755 --- a/scripts/rnnlm/show_word_features.py +++ b/scripts/rnnlm/show_word_features.py @@ -9,7 +9,7 @@ sys.stdout = open(1, 'w', encoding='latin-1', closefd=False) import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="This script turns the word features to a human readable format.", epilog="E.g. " + sys.argv[0] + "exp/rnnlm/word_feats.txt exp/rnnlm/features.txt " diff --git a/scripts/rnnlm/validate_features.py b/scripts/rnnlm/validate_features.py index 010ceb72615..2a077da4758 100755 --- a/scripts/rnnlm/validate_features.py +++ b/scripts/rnnlm/validate_features.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="Validates features file, produced by rnnlm/choose_features.py.", epilog="E.g. 
" + sys.argv[0] + " exp/rnnlm/features.txt", diff --git a/scripts/rnnlm/validate_text_dir.py b/scripts/rnnlm/validate_text_dir.py index 4b311a8abbd..903e720bdf4 100755 --- a/scripts/rnnlm/validate_text_dir.py +++ b/scripts/rnnlm/validate_text_dir.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="Validates data directory containing text " "files from one or more data sources, including dev.txt.", diff --git a/scripts/rnnlm/validate_word_features.py b/scripts/rnnlm/validate_word_features.py index f8eb5858d95..205b934ae1b 100755 --- a/scripts/rnnlm/validate_word_features.py +++ b/scripts/rnnlm/validate_word_features.py @@ -8,7 +8,7 @@ import sys import re -tab_or_space = re.compile('[ \t]') +tab_or_space = re.compile('[ \t]+') parser = argparse.ArgumentParser(description="Validates word features file, produced by rnnlm/get_word_features.py.", epilog="E.g. " + sys.argv[0] + " --features-file=exp/rnnlm/features.txt " diff --git a/src/bin/acc-lda.cc b/src/bin/acc-lda.cc index 92cd192b9a6..b664135bdc7 100644 --- a/src/bin/acc-lda.cc +++ b/src/bin/acc-lda.cc @@ -37,7 +37,7 @@ int main(int argc, char *argv[]) { "Accumulate LDA statistics based on pdf-ids.\n" "Usage: acc-lda [options] \n" "Typical usage:\n" - " ali-to-post ark:1.ali ark:- | lda-acc 1.mdl \"ark:splice-feats scp:train.scp|\" ark:- ldaacc.1\n"; + " ali-to-post ark:1.ali ark:- | acc-lda 1.mdl \"ark:splice-feats scp:train.scp|\" ark:- ldaacc.1\n"; bool binary = true; BaseFloat rand_prune = 0.0; @@ -126,5 +126,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/chainbin/nnet3-chain-train.cc b/src/chainbin/nnet3-chain-train.cc index 9ea7ba1b06f..536669a17d3 100644 --- a/src/chainbin/nnet3-chain-train.cc +++ b/src/chainbin/nnet3-chain-train.cc @@ -20,6 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "nnet3/nnet-chain-training.h" +#include "cudamatrix/cu-allocator.h" int main(int argc, char *argv[]) { @@ -52,6 +53,7 @@ int main(int argc, char *argv[]) { "yes|no|optional|wait, only has effect if compiled with CUDA"); opts.Register(&po); + RegisterCuAllocatorOptions(&po); po.Read(argc, argv); diff --git a/src/configure b/src/configure index 2f506b9073c..41262259165 100755 --- a/src/configure +++ b/src/configure @@ -42,7 +42,7 @@ # This should be incremented after any significant change to the configure # script, i.e. any change affecting kaldi.mk or the build system as a whole. -CONFIGURE_VERSION=7 +CONFIGURE_VERSION=8 if ! [ -x "$PWD/configure" ]; then echo 'You must run "configure" from the src/ directory.' @@ -367,7 +367,7 @@ function linux_configure_mkl_threading { function configure_cuda { # Check for CUDA toolkit in the system if [ ! 
-d "$CUDATKDIR" ]; then - for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do + for base in /usr/local/share/cuda /usr/local/cuda /usr/; do if [ -f $base/bin/nvcc ]; then CUDATKDIR=$base fi @@ -395,14 +395,6 @@ function configure_cuda { GCC_VER=$($COMPILER -dumpversion) GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d") case $CUDA_VERSION in - 5_5) - MIN_UNSUPPORTED_GCC_VER="5.0" - MIN_UNSUPPORTED_GCC_VER_NUM=50000; - ;; - 6_*) - MIN_UNSUPPORTED_GCC_VER="5.0" - MIN_UNSUPPORTED_GCC_VER_NUM=50000; - ;; 7_*) MIN_UNSUPPORTED_GCC_VER="5.0" MIN_UNSUPPORTED_GCC_VER_NUM=50000; @@ -454,6 +446,8 @@ function configure_cuda { else cat makefiles/cuda_64bit.mk >> kaldi.mk fi + elif [ "`uname -m`" == "aarch64" ]; then + cat makefiles/cuda_64bit.mk >> kaldi.mk elif [ "`uname -m`" == "ppc64le" ]; then cat makefiles/cuda_64bit.mk >> kaldi.mk else diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index fec75b01a3f..cfbc6757530 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -1,6 +1,6 @@ // cudamatrix/cu-allocator.cc -// Copyright 2015 Johns Hopkins University (author: Daniel Povey) +// Copyright 2015-2018 Johns Hopkins University (author: Daniel Povey) // See ../../COPYING for clarification regarding multiple authors // @@ -19,6 +19,8 @@ +#include "cudamatrix/cu-allocator.h" + #if HAVE_CUDA == 1 #include @@ -28,6 +30,10 @@ #include #include #include +#ifndef _MSC_VER +#include +#endif + #include "cudamatrix/cu-common.h" #include "cudamatrix/cu-device.h" #include "cudamatrix/cu-matrix.h" @@ -39,10 +45,207 @@ namespace kaldi { void* CuMemoryAllocator::Malloc(size_t size) { - // For now just call MallocPitch and throw away the pitch, to avoid - // duplicating code here. Apparently the time difference is quite small. - size_t pitch; - return MallocPitch(size, 1, &pitch); + Timer tim; + if (!opts_.cache_memory) { + void *ans; + CU_SAFE_CALL(cudaMalloc(&ans, size)); + double elapsed = tim.Elapsed(); + tot_time_taken_ += elapsed; + malloc_time_taken_ += elapsed; + t_++; + return ans; + } + + // We could perhaps change this to KALDI_PARANOID_ASSERT to save time. + KALDI_ASSERT(size != 0); + + // Round up 'size' to a multiple of 256; this ensures the right kind of + // memory alignment. + size = (size + 255) & ~((size_t)255); + void *ans = MallocInternal(size); + tot_time_taken_ += tim.Elapsed(); + return ans; +} + + +CuMemoryAllocator::MemoryBlock *CuMemoryAllocator::SplitBlock( + MemoryBlock *block, size_t size) { + SubRegion *subregion = block->subregion; + // new_block will become the right-most part of 'block', and 'block' will + // be the left-most part. + MemoryBlock *new_block = new MemoryBlock; + bool return_new_block; + char *new_begin; + + // We now decide whether to make the left part of 'block' be of size ('size') + // and return it (the 'if' branch of the if-else block below), or the right + // part (the 'else' branch). We decide this based on heuristics. Basically, + // we want to allocate the sub-block that's either next to the edge of the + // MemoryRegion, or next to something that was allocated long ago (and which, + // we assume won't be deallocated for a relatively long time). 
That is: we + // want to leave the un-allocated memory next to a memory block that was + // recently allocated (and thus is likely to be freed sooner), so that when + // that block is freed we can merge it with the still-unallocated piece into a + // larger block; this will reduce fragmentation. But if this block spans + // multiple sub-regions we don't want to do that, as that would be against our + // heuristic of, where possible, allocating memory from lower-numbered + // sub-regions. + // + // Bear in mind that we can assume block->next and block->prev, if they are + // non-NULL, are both currently allocated, since 'block' is un-allocated and + // we would have merged any adjacent un-allocated sub-regions. + if (block->next != NULL && block->prev != NULL && + block->prev->t < block->next->t && + block->next->subregion == subregion) { + // We'll allocate the right part of the block, since the left side is next + // to a relatively recently-allocated block. + return_new_block = true; + new_begin = block->end - size; + } else { + // We'll allocate the left part of the block. + return_new_block = false; + new_begin = block->begin + size; + } + + // The following code makes sure the SubRegion for 'new_block' is correct, + // i.e. its 'begin' is >= the 'begin' of the subregion and < the 'end' of the + // subregion. If the following loop segfaults, it indicates a bug somewhere + // else. + while (new_begin >= subregion->end) + subregion = subregion->next; + MemoryBlock *next_block = block->next; + new_block->begin = new_begin; + new_block->end = block->end; + new_block->subregion = subregion; + new_block->allocated = false; + new_block->thread_id = block->thread_id; + new_block->t = block->t; + new_block->next = next_block; + new_block->prev = block; + if (next_block) + next_block->prev = new_block; + block->next = new_block; + block->end = new_begin; + + // Add the split-up piece that we won't be allocating, to the + // 'free_blocks' member of its subregion. + if (return_new_block) { + AddToFreeBlocks(block); + return new_block; + } else { + AddToFreeBlocks(new_block); + return block; + } +} + + +void CuMemoryAllocator::RemoveFromFreeBlocks(MemoryBlock *block) { + SubRegion *subregion = block->subregion; + size_t block_size = block->end - block->begin; + std::pair p(block_size, block); + size_t num_removed = subregion->free_blocks.erase(p); + KALDI_ASSERT(num_removed != 0); + // Update largest_free_block_, if needed. + size_t subregion_index = subregion->subregion_index; + if (block_size == largest_free_block_[subregion_index]) { + if (subregion->free_blocks.empty()) + largest_free_block_[subregion_index] = 0; + else + largest_free_block_[subregion_index] = + subregion->free_blocks.begin()->first; + } +} + +void CuMemoryAllocator::AddToFreeBlocks(MemoryBlock *block) { + SubRegion *subregion = block->subregion; + KALDI_PARANOID_ASSERT(block->begin >= subregion->begin && + block->begin < subregion->end); + size_t block_size = block->end - block->begin, + subregion_index = subregion->subregion_index; + // Update largest_free_block_, if needed. 
+ if (block_size > largest_free_block_[subregion_index]) { + largest_free_block_[subregion_index] = block_size; + } + subregion->free_blocks.insert(std::pair(block_size, block)); +} + + +void* CuMemoryAllocator::MallocFromSubregion(SubRegion *subregion, + size_t size) { + // NULL is implementation defined and doesn't have to be zero so we can't + // guarantee that NULL will be <= a valid pointer-- so we cast to a pointer + // from zero instead of using NULL. + std::pair p(size, (MemoryBlock*)0); + + std::set >::iterator iter = + subregion->free_blocks.lower_bound(p); + // so now 'iter' is the first member of free_blocks whose size_t value is >= + // size. If 'iter' was equal to the end() of that multi_map, it would be a + // bug because the calling code checked that the largest free block in this + // region was sufficiently large. We don't check this; if it segfaults, we'll + // debug. + + MemoryBlock *block = iter->second; + // Erase 'block' from its subregion's free blocks list... the next lines are + // similar to RemoveFromFreeBlocks(), but we code it directly as we have the + // iterator here, and it would be wasteful to do another lookup. + subregion->free_blocks.erase(iter); + // Update largest_free_block_, if needed. The following few lines of code also appear + // in RemoveFromFreeBlocks(). + size_t block_size = block->end - block->begin, + subregion_index = subregion->subregion_index; + if (block_size == largest_free_block_[subregion_index]) { + if (subregion->free_blocks.empty()) + largest_free_block_[subregion_index] = 0; + else + largest_free_block_[subregion_index] = + subregion->free_blocks.begin()->first; + } + + KALDI_PARANOID_ASSERT(block_size >= size && block->allocated == false); + + // the most memory we allow to be 'wasted' by failing to split a block, is the + // smaller of: 1/16 of the size we're allocating, or half a megabyte. + size_t allowed_extra_size = std::min(size >> 4, 524288); + if (block_size > size + allowed_extra_size) { + // If the requested block is substantially larger than what was requested, + // split it so we don't waste memory. + block = SplitBlock(block, size); + } + + if (std::this_thread::get_id() != block->thread_id && + block->t > synchronize_gpu_t_) { + // see NOTE ON SYNCHRONIZATION in the header. + SynchronizeGpu(); + synchronize_gpu_t_ = t_; + num_synchronizations_++; + } + block->allocated = true; + block->t = t_; + allocated_block_map_[block->begin] = block; + return block->begin; +} + +// By the time MallocInternal is called, we will have ensured that 'size' is +// a nonzero multiple of 256 (for memory aligment reasons). +// inline +void* CuMemoryAllocator::MallocInternal(size_t size) { +start: + std::vector::const_iterator iter = largest_free_block_.begin(), + end = largest_free_block_.end(); + size_t subregion_index = 0; + for (; iter != end; ++iter, ++subregion_index) { + if (*iter > size) { + return MallocFromSubregion(subregions_[subregion_index], size); + } + } + // We dropped off the loop without finding a subregion with enough memory + // to satisfy the request -> allocate a new region. + AllocateNewRegion(size); + // An infinite loop shouldn't be possible because after calling + // AllocateNewRegion(size), there should always be a SubRegion + // with that size available. + goto start; } // Returns max(0, floor(log_2(i))). Not tested independently. 
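// The free-block lookup in MallocFromSubregion() above keeps each sub-region's
// free blocks in a std::set ordered by (size, pointer) and uses lower_bound()
// to find the smallest free block whose size is >= the request (best fit).
// Below is a minimal standalone sketch of that idea, not Kaldi code: the
// 'Block' and 'FreeList' names are illustrative only, and the real allocator
// additionally splits oversized blocks (SplitBlock) and maintains the
// largest_free_block_ cache.
#include <cstddef>
#include <set>
#include <utility>

struct Block { char *begin; char *end; };

class FreeList {
 public:
  // Returns the smallest free block of at least 'size' bytes, or NULL if no
  // free block is large enough.
  Block *Take(std::size_t size) {
    // (size, null pointer) sorts no later than any real entry of that size,
    // so lower_bound() lands on the first block whose size is >= 'size'.
    std::pair<std::size_t, Block*> key(size, static_cast<Block*>(0));
    std::set<std::pair<std::size_t, Block*> >::iterator it =
        free_blocks_.lower_bound(key);
    if (it == free_blocks_.end()) return NULL;
    Block *block = it->second;
    free_blocks_.erase(it);  // the block is no longer free
    return block;
  }
  // Returns a block to the free list, keyed by its size.
  void Give(Block *block) {
    std::size_t block_size = block->end - block->begin;
    free_blocks_.insert(std::make_pair(block_size, block));
  }
 private:
  std::set<std::pair<std::size_t, Block*> > free_blocks_;
};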
@@ -63,311 +266,341 @@ static inline size_t IntegerLog2(size_t i) { return ans; } -//inline -CuMemoryAllocator::MruCache& CuMemoryAllocator::GetCacheForSize( - size_t num_bytes) { - size_t bucket_index = IntegerLog2(num_bytes); - KALDI_ASSERT(num_bytes > 0 && bucket_index < caches_.size()); - return caches_[bucket_index]; -} - -//inline -void* CuMemoryAllocator::MallocPitchInternal(size_t row_bytes, - size_t num_rows, - size_t *pitch) { - num_system_allocations_++; - void *ans; - cudaError_t e; - for (int32 i = 0; i <= 2; i++) { - if (num_rows != 1) { - CuTimer tim; - e = cudaMallocPitch(&ans, pitch, row_bytes, num_rows); - tot_time_taken_in_cuda_malloc_pitch_ += tim.Elapsed(); +std::string GetFreeGpuMemory(int64* free, int64* total) { +#ifdef _MSC_VER + size_t mem_free, mem_total; + cuMemGetInfo_v2(&mem_free, &mem_total); +#else + // define the function signature type + size_t mem_free, mem_total; + { + // we will load cuMemGetInfo_v2 dynamically from libcuda.so + // pre-fill ``safe'' values that will not cause problems + mem_free = 1; mem_total = 1; + // open libcuda.so + void* libcuda = dlopen("libcuda.so", RTLD_LAZY); + if (NULL == libcuda) { + KALDI_WARN << "cannot open libcuda.so"; } else { - CuTimer tim; - // we might save a little time this way. - e = cudaMalloc(&ans, row_bytes); - tot_time_taken_in_cuda_malloc_ += tim.Elapsed(); - *pitch = row_bytes; - } - if (e != cudaSuccess) { - PrintMemoryUsage(); - // On the first 2 out of the 3 iters, try freeing memory. - if (i <= 1) { - KALDI_WARN << "Allocation of " << row_bytes << " x " - << num_rows << " region failed: freeing some memory and " - << "trying again. "; - BaseFloat new_memory_factor = 1.1; - if (opts_.memory_factor > new_memory_factor) { - KALDI_LOG << "To avoid future problems like this, changing " - << "memory_factor from " << opts_.memory_factor << " to " - << new_memory_factor; - opts_.memory_factor = new_memory_factor; - } - size_t memory_cached = MemoryCached(), - memory_requested = row_bytes * num_rows, - memory_to_free = std::max(memory_cached / 2, - std::min(memory_cached, - memory_requested)); - FreeSomeCachedMemory(memory_to_free); + // define the function signature type + // and get the symbol + typedef CUresult (*cu_fun_ptr)(size_t*, size_t*); + cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); + if (NULL == dl_cuMemGetInfo) { + KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so"; } else { - KALDI_ERR << "Cannot allocate the requested memory (" - << row_bytes << " x " << num_rows << " = " - << row_bytes * num_rows << " bytes)"; + // call the function + dl_cuMemGetInfo(&mem_free, &mem_total); } - cudaGetLastError(); // Clear the error state. 
- } else { - break; + // close the library + dlclose(libcuda); } } - return ans; +#endif + // copy the output values outside + if (NULL != free) *free = mem_free; + if (NULL != total) *total = mem_total; + // prepare the text output + std::ostringstream os; + os << "free:" << mem_free/(1024*1024) << "M, " + << "used:" << (mem_total-mem_free)/(1024*1024) << "M, " + << "total:" << mem_total/(1024*1024) << "M, " + << "free/total:" << mem_free/(float)mem_total; + return os.str(); } void CuMemoryAllocator::PrintMemoryUsage() const { - KALDI_LOG << "Memory usage: " << cur_bytes_allocated_ - << " bytes currently allocated (max: " - << max_bytes_allocated_ << "); " << cur_bytes_used_ - << " currently in use by user (max: " << max_bytes_used_ << ")" - << "; " << num_system_allocations_ << '/' - << num_user_allocations_ << " calls to Malloc* resulted in " - << "CUDA calls."; - if (GetVerboseLevel() >= 1) { - // CuTimer only accumulates stats at verbose level 1 or above. - KALDI_LOG << "Time taken in cudaMallocPitch=" << tot_time_taken_in_cuda_malloc_pitch_ - << ", in cudaMalloc=" << tot_time_taken_in_cuda_malloc_ - << ", in cudaFree=" << tot_time_taken_in_cuda_free_ - << ", in this->MallocPitch()=" << tot_time_taken_in_malloc_pitch_; + if (!opts_.cache_memory) { + KALDI_LOG << "Not caching allocations; time taken in " + << "malloc/free is " << malloc_time_taken_ + << "/" << (tot_time_taken_ - malloc_time_taken_) + << ", num operations is " << t_ + << "; device memory info: " + << GetFreeGpuMemory(NULL, NULL); + return; } + + size_t num_blocks_allocated = 0, num_blocks_free = 0, + memory_allocated = 0, memory_held = 0, + largest_free_block = 0, largest_allocated_block = 0; + + for (size_t i = 0; i < memory_regions_.size(); i++) { + MemoryBlock *m = memory_regions_[i].block_begin; + KALDI_ASSERT(m->begin == memory_regions_[i].begin); + for (; m != NULL; m = m->next) { + size_t size = m->end - m->begin; + if (m->allocated) { + num_blocks_allocated++; + memory_allocated += size; + if (size > largest_allocated_block) + largest_allocated_block = size; + } else { + num_blocks_free++; + if (size > largest_free_block) + largest_free_block = size; + } + memory_held += size; + // The following is just some sanity checks; this code is rarely called so + // it's a reasonable place to put them. + if (m->next) { + KALDI_ASSERT(m->next->prev == m && m->end == m->next->begin); + } else { + KALDI_ASSERT(m->end == memory_regions_[m->subregion->memory_region].end); + } + } + } + KALDI_LOG << "Memory usage: " << memory_allocated << "/" + << memory_held << " bytes currently allocated/total-held; " + << num_blocks_allocated << "/" << num_blocks_free + << " blocks currently allocated/free; largest " + << "free/allocated block sizes are " + << largest_allocated_block << "/" << largest_free_block + << "; time taken total/cudaMalloc is " + << tot_time_taken_ << "/" << malloc_time_taken_ + << ", synchronized the GPU " << num_synchronizations_ + << " times out of " << (t_/2) << " frees; " + << "device memory info: " << GetFreeGpuMemory(NULL, NULL); +} + +// Note: we just initialize with the default options, but we can change it later +// (as long as it's before we first use the class) by calling SetOptions(). 
+CuMemoryAllocator::CuMemoryAllocator(): + opts_(CuAllocatorOptions()), + t_(0), + synchronize_gpu_t_(0), + num_synchronizations_(0), + tot_time_taken_(0.0), + malloc_time_taken_(0.0) { + // Note: we don't allocate any memory regions at the start; we wait for the user + // to call Malloc() or MallocPitch(), and then allocate one when needed. } -CuMemoryAllocator::CuMemoryAllocator(CuAllocatorOptions opts): - opts_(opts), - caches_(40), - cur_bytes_allocated_(0), - max_bytes_allocated_(0), - cur_bytes_used_(0), - max_bytes_used_(0), - t_(1), - num_user_allocations_(0), - num_system_allocations_(0), - tot_time_taken_in_cuda_malloc_(0.0), - tot_time_taken_in_cuda_malloc_pitch_(0.0), - tot_time_taken_in_cuda_free_(0.0), - tot_time_taken_in_malloc_pitch_(0.0) { } void* CuMemoryAllocator::MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { - CuTimer tim; - t_++; - num_user_allocations_++; - size_t requested_bytes = row_bytes * num_rows; - if (cur_bytes_used_ + requested_bytes > max_bytes_used_) - max_bytes_used_ = cur_bytes_used_ + requested_bytes; - MruCache &cache = GetCacheForSize(requested_bytes); - MemoryRequest request(row_bytes, num_rows); - CachedMemoryElement output; - if (cache.Lookup(request, &output)) { - // we have cached memory with this value. - void *ans = output.pointer; - *pitch = output.pitch; - used_map_[ans] = UsedMemoryElement(row_bytes, num_rows, output.pitch); - cur_bytes_used_ += requested_bytes; - tot_time_taken_in_malloc_pitch_ += tim.Elapsed(); - return ans; - } else { - // note: it's important that we already updated max_bytes_used_. - size_t next_bytes_allocated = cur_bytes_allocated_ + requested_bytes, - max_bytes_to_allocate = - static_cast(opts_.memory_factor * max_bytes_used_); - ssize_t bytes_overflow = next_bytes_allocated - max_bytes_to_allocate; - if (bytes_overflow > 0) { - // The amount we would have allocated, after fulfilling this request, - // would exceed our limits (we don't allow ourselves to allocate more than - // memory_factor times the maximum amount of memory the user ever owns - // during the lifetime of the program). So free some memory. - KALDI_ASSERT(bytes_overflow <= MemoryCached()); // sanity check. - FreeSomeCachedMemory(static_cast(bytes_overflow)); - KALDI_ASSERT(cur_bytes_allocated_ + requested_bytes <= - max_bytes_to_allocate); - } - void *ans = MallocPitchInternal(row_bytes, num_rows, pitch); - cur_bytes_allocated_ += requested_bytes; - if (cur_bytes_allocated_ > max_bytes_allocated_) - max_bytes_allocated_ = cur_bytes_allocated_; - used_map_[ans] = UsedMemoryElement(row_bytes, num_rows, *pitch); - cur_bytes_used_ += requested_bytes; - tot_time_taken_in_malloc_pitch_ += tim.Elapsed(); + Timer tim; + if (!opts_.cache_memory) { + void *ans; + CU_SAFE_CALL(cudaMallocPitch(&ans, pitch, row_bytes, num_rows)); + double elapsed = tim.Elapsed(); + tot_time_taken_ += elapsed; + malloc_time_taken_ += elapsed; return ans; } -} -void CuMemoryAllocator::FreeSomeCachedMemory(size_t bytes_to_free_in) { - CuTimer tim; - // the next few lines are responsible for increasing the amount of memory we - // are going to free, in case the user requested an amount that's very tiny - // compared with the total amount of memory ever used. This helps us - // to amortize the cost of visiting all of the buckets inside this code. - // (there are only 40 buckets so it's not so big, but we're being careful. 
- size_t bytes_cached = cur_bytes_allocated_ - cur_bytes_used_, - min_to_free = static_cast(max_bytes_used_ * opts_.delete_factor); - size_t bytes_to_free = std::min(bytes_cached, - std::max(bytes_to_free_in, min_to_free)), - bytes_freed = 0; - - size_t num_caches = caches_.size(), - t = t_; - // size_factor contains the approximate (power-of-two) size of the pointers - // that each cache's pointers contain. The 'cost' of keeping any given pointer, - // we declare to be the time since we last used it multiplied by the size - // of the memory in the pointer. - std::vector size_factor(num_caches); - for (size_t i = 0, j=1; i < num_caches; i++, j *= 2) - size_factor[i] = j; - - std::priority_queue > queue; - // Set up the queue. - for (int32 i = 0; i < num_caches; i++) { - const MruCache &cache = caches_[i]; - size_t cache_t = cache.LeastRecentTime(); - if (cache_t > 0) { // t == 0 means the cache is empty. - size_t interval = t - cache_t; - BaseFloat cost = size_factor[i] * interval; - KALDI_ASSERT(interval > 0); - queue.push(std::pair(cost, i)); - } - } - while (bytes_freed < bytes_to_free) { - // If the following fails it means I made some kind of bookkeeping error, - // and most likely we are trying to free more memory than we really have - // cached. - KALDI_ASSERT(!queue.empty() && "Code error."); - std::pair p = queue.top(); - int32 cache_index = p.second; - MruCache &cache = caches_[cache_index]; - queue.pop(); - if (queue.empty()) { - while (bytes_freed < bytes_to_free) { - bytes_freed += cache.RemoveLeastRecentlyUsed(); - } - } else { - BaseFloat next_worst_cost = queue.top().first; - while (1) { - bytes_freed += cache.RemoveLeastRecentlyUsed(); - if (bytes_freed >= bytes_to_free) - break; - size_t least_recent_time = cache.LeastRecentTime(); - if (least_recent_time == 0) // this cache is now empty - break; - size_t interval = t - least_recent_time; - KALDI_ASSERT(interval > 0); - BaseFloat cost = size_factor[cache_index] * interval; - if (cost < next_worst_cost) { - // There is another bucket that has worse cost than this, - // so stop processing this bucket-- but first put it - // back in the queue. - queue.push(std::pair(cost, cache_index)); - break; - } - } - } - } - KALDI_ASSERT(bytes_freed <= cur_bytes_allocated_); - cur_bytes_allocated_ -= bytes_freed; - tot_time_taken_in_cuda_free_ += tim.Elapsed(); + // Round up row_bytes to a multiple of 256. + row_bytes = (row_bytes + 255) & ~((size_t)255); + *pitch = row_bytes; + void *ans = MallocInternal(row_bytes * num_rows); + tot_time_taken_ += tim.Elapsed(); + return ans; } void CuMemoryAllocator::Free(void *ptr) { + CuTimer tim; + if (!opts_.cache_memory) { + CU_SAFE_CALL(cudaFree(ptr)); + tot_time_taken_ += tim.Elapsed(); + t_++; + return; + } t_++; - unordered_map::iterator iter = - used_map_.find(ptr); - if (iter == used_map_.end()) { + unordered_map::iterator iter = + allocated_block_map_.find(ptr); + if (iter == allocated_block_map_.end()) { KALDI_ERR << "Attempt to free CUDA memory pointer that was not allocated: " << ptr; } - const UsedMemoryElement &elem = iter->second; - size_t num_bytes = elem.row_bytes * elem.num_rows; - - cur_bytes_used_ -= num_bytes; - MruCache &cache = GetCacheForSize(num_bytes); + MemoryBlock *block = iter->second; + allocated_block_map_.erase(iter); + block->t = t_; + block->thread_id = std::this_thread::get_id(); + block->allocated = false; + + // If this is not the first block of the memory region and the previous block + // is not allocated, merge this block into the previous block. 
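// The two merge steps in Free() just below (merge into a free previous block,
// then absorb a free next block) are standard free-block coalescing on the
// doubly-linked list of MemoryBlocks.  A simplified standalone sketch of the
// same idea follows; it is not Kaldi code, uses a hypothetical 'Block' type,
// and omits the free_blocks bookkeeping and thread-id/synchronization logic
// of the real allocator.
#include <cstddef>

struct Block {
  char *begin, *end;   // the extent of this block of memory
  bool allocated;      // true if currently owned by the user
  Block *prev, *next;  // adjacent blocks; prev->end == begin, end == next->begin
};

// Called when 'block' is freed; merges it with free neighbours so adjacent
// free blocks always form one larger block, and returns the surviving block.
Block *CoalesceAfterFree(Block *block) {
  block->allocated = false;
  if (block->prev != NULL && !block->prev->allocated) {
    // Extend the previous block to cover this one, then discard this one.
    Block *prev = block->prev;
    prev->end = block->end;
    prev->next = block->next;
    if (block->next != NULL) block->next->prev = prev;
    delete block;
    block = prev;
  }
  if (block->next != NULL && !block->next->allocated) {
    // Extend the surviving block to cover the next one, then discard it.
    Block *next = block->next;
    block->end = next->end;
    block->next = next->next;
    if (next->next != NULL) next->next->prev = block;
    delete next;
  }
  return block;
}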
+ MemoryBlock *prev_block = block->prev; + if (prev_block != NULL && !prev_block->allocated) { + RemoveFromFreeBlocks(prev_block); + prev_block->end = block->end; + if (prev_block->thread_id != block->thread_id) { + // the two blocks we're merging were freed by different threads, so we + // give the 'nonexistent thread' as their thread, which means that + // whichever thread requests that block, we force synchronization. We can + // assume that prev_block was previously allocated (prev_block->t > 0) + // because we always start from the left when allocating blocks, and we + // know that this block was previously allocated. + prev_block->thread_id = std::thread::id(); + } + prev_block->t = t_; + prev_block->next = block->next; + if (block->next) + block->next->prev = prev_block; + delete block; + block = prev_block; + } - cache.Insert(MemoryRequest(elem.row_bytes, elem.num_rows), - CachedMemoryElement(ptr, t_, elem.pitch)); - used_map_.erase(iter); + // If this is not the last block of the memory region and the next block is + // not allocated, merge the next block into this block. + MemoryBlock *next_block = block->next; + if (next_block != NULL && !next_block->allocated) { + // merge next_block into 'block', deleting 'next_block'. Note: at this + // point, if we merged with the previous block, the variable 'block' may now + // be pointing to that previous block, so it would be a 3-way merge. + RemoveFromFreeBlocks(next_block); + block->end = next_block->end; + if (next_block->thread_id != block->thread_id && next_block->t > 0) { + // the two blocks we're merging were freed by different threads, so we + // give the 'nonexistent thread' as their thread, which means that + // whichever thread requests that block, we force synchronization. there + // is no need to do this if next_block->t == 0, which would mean it had + // never been allocated. + block->thread_id = std::thread::id(); + } + // We don't need to inspect the 't' value of next_block; it can't be + // larger than t_ because t_ is now. + block->next = next_block->next; + if (block->next) + block->next->prev = block; + delete next_block; + } + AddToFreeBlocks(block); + tot_time_taken_ += tim.Elapsed(); } -size_t CuMemoryAllocator::MruCache::LeastRecentTime() const { - if (list_.empty()) { - KALDI_PARANOID_ASSERT(map_.empty()); - return 0; - } else { - const MemoryRequest &mr = list_.front(); - MapType::const_iterator iter = map_.find(mr); - KALDI_ASSERT(iter != map_.end()); - const MapValueType &queue = iter->second; - KALDI_ASSERT(!queue.empty()); - return queue.front().first.t; +void CuMemoryAllocator::AllocateNewRegion(size_t size) { + int64 free_memory, total_memory; + std::string mem_info = GetFreeGpuMemory(&free_memory, &total_memory); + opts_.Check(); + size_t region_size = static_cast(free_memory * opts_.memory_proportion); + if (region_size < size) + region_size = size; + // Round up region_size to an exact multiple of 1M (note: we expect it will + // be much larger than that). 1048575 is 2^20 - 1. + region_size = (region_size + 1048575) & ~((size_t)1048575); + + if (!memory_regions_.empty()) { + // If this is not the first region allocated, print some information. 
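// Both Malloc() above (rounding requests up to a multiple of 256 bytes) and
// the region sizing in AllocateNewRegion() (rounding up to a multiple of 1MiB;
// 1048575 is 2^20 - 1) use the same bit-masking idiom for rounding an integer
// up to a multiple of a power of two.  A small illustration, not Kaldi code:
#include <cassert>
#include <cstddef>

// Rounds n up to the next multiple of 'alignment', which must be a power of
// two: (alignment - 1) has all the low bits set, and masking with
// ~(alignment - 1) clears them after adding alignment - 1.
static inline std::size_t RoundUp(std::size_t n, std::size_t alignment) {
  return (n + alignment - 1) & ~(alignment - 1);
}

int main() {
  assert(RoundUp(1, 256) == 256);     // tiny requests become one 256-byte unit
  assert(RoundUp(256, 256) == 256);   // exact multiples are unchanged
  assert(RoundUp(257, 256) == 512);
  assert(RoundUp(3000000, 1 << 20) == 3145728);  // next multiple of 1 MiB
  return 0;
}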
+ KALDI_LOG << "About to allocate new memory region of " << region_size + << " bytes; current memory info is: " << mem_info; + } + void *memory_region; + cudaError_t e; + { + Timer tim; + e = cudaMalloc(&memory_region, region_size); + malloc_time_taken_ += tim.Elapsed(); + } + if (e != cudaSuccess) { + PrintMemoryUsage(); + if (!CuDevice::Instantiate().IsComputeExclusive()) { + KALDI_ERR << "Failed to allocate a memory region of " << region_size + << " bytes. Possibly this is due to sharing the GPU. Try " + << "switching the GPUs to exclusive mode (nvidia-smi -c 3) and using " + << "the option --use-gpu=wait to scripts like " + << "steps/nnet3/chain/train.py. Memory info: " + << mem_info; + } else { + KALDI_ERR << "Failed to allocate a memory region of " << region_size + << " bytes. Possibly smaller minibatch size would help. " + << "Memory info: " << mem_info; + } + } + // this_num_subregions would be approximately 'opts_.num_subregions' if + // 'region_size' was all the device's memory. (We add one to round up). + // We're aiming to get a number of sub-regions approximately equal to + // opts_.num_subregions by the time we allocate all the device's memory. + size_t this_num_subregions = 1 + + (region_size * opts_.num_subregions) / total_memory; + + size_t memory_region_index = memory_regions_.size(); + memory_regions_.resize(memory_region_index + 1); + MemoryRegion &this_region = memory_regions_.back(); + + this_region.begin = static_cast(memory_region); + this_region.end = this_region.begin + region_size; + // subregion_size will be hundreds of megabytes. + size_t subregion_size = region_size / this_num_subregions; + + std::vector new_subregions; + char* subregion_begin = static_cast(memory_region); + for (size_t i = 0; i < this_num_subregions; i++) { + SubRegion *subregion = new SubRegion(); + subregion->memory_region = memory_region_index; + subregion->begin = subregion_begin; + if (i + 1 == this_num_subregions) { + subregion->end = this_region.end; + KALDI_ASSERT(subregion->end > subregion->begin); + } else { + subregion->end = subregion_begin + subregion_size; + subregion_begin = subregion->end; + } + subregion->next = NULL; + if (i > 0) { + new_subregions.back()->next = subregion; + } + new_subregions.push_back(subregion); } + // Initially the memory is in a single block, owned by + // the first subregion. It will be split up gradually. + MemoryBlock *block = new MemoryBlock(); + block->begin = this_region.begin; + block->end = this_region.end; + block->subregion = new_subregions.front(); + block->allocated = false; + block->t = 0; // was never allocated. + block->next = NULL; + block->prev = NULL; + for (size_t i = 0; i < this_num_subregions; i++) + subregions_.push_back(new_subregions[i]); + SortSubregions(); + this_region.block_begin = block; + + AddToFreeBlocks(block); } -bool CuMemoryAllocator::MruCache::Lookup(const MemoryRequest &request, - CachedMemoryElement *output) { - MapType::iterator iter = map_.find(request); - if (iter == map_.end()) - return false; - MapValueType &q = iter->second; - KALDI_ASSERT(!q.empty()); - // use q.back() as we want to return the most recently used one if there - // is a choice. We believe this will give better caching behavior. - *output = q.back().first; - list_.erase(q.back().second); - q.pop_back(); - if (q.empty()) - map_.erase(request); - return true; +// We sort the sub-regions according to the distance between the start of the +// MemoryRegion of which they are a part, and the start of the SubRegion. 
This +// will generally mean that the highest-numbered SubRegion-- the one we keep +// free at all costs-- will be the end of the first block which we allocated +// (which under most situations will be the largest block). +void CuMemoryAllocator::SortSubregions() { + largest_free_block_.resize(subregions_.size()); + + std::vector > pairs; + for (size_t i = 0; i < subregions_.size(); i++) { + SubRegion *subregion = subregions_[i]; + MemoryRegion &memory_region = memory_regions_[subregion->memory_region]; + size_t distance = subregion->begin - memory_region.begin; + pairs.push_back(std::pair(distance, subregion)); + } + std::sort(pairs.begin(), pairs.end()); + for (size_t i = 0; i < subregions_.size(); i++) { + subregions_[i] = pairs[i].second; + subregions_[i]->subregion_index = i; + if (subregions_[i]->free_blocks.empty()) + largest_free_block_[i] = 0; + else + largest_free_block_[i] = subregions_[i]->free_blocks.begin()->first; + } } -void CuMemoryAllocator::MruCache::Insert(const MemoryRequest &request, - const CachedMemoryElement &element) { - list_.push_back(request); - map_[request].push_back(std::pair( - element, - --list_.end())); -} +CuMemoryAllocator g_cuda_allocator; -size_t CuMemoryAllocator::MruCache::RemoveLeastRecentlyUsed() { - // Remove least-recently-used element from cache. - KALDI_ASSERT(!list_.empty()); - MemoryRequest request = list_.front(); - MapType::iterator iter = map_.find(request); - KALDI_ASSERT(iter != map_.end()); - MapValueType &queue = iter->second; - KALDI_ASSERT(!queue.empty()); - // least recently used elements are at the front of the queue. - std::pair &p = queue.front(); - KALDI_ASSERT(p.second == list_.begin()); - CU_SAFE_CALL(cudaFree(p.first.pointer)); - queue.pop_front(); - if (queue.empty()) - map_.erase(request); - list_.pop_front(); - return request.first * request.second; -} -CuMemoryAllocator::MruCache& CuMemoryAllocator::MruCache::operator = ( - const CuMemoryAllocator::MruCache &other) { - KALDI_ASSERT(other.list_.empty()); - return *this; -} -CuMemoryAllocator::MruCache::MruCache( - const CuMemoryAllocator::MruCache &other) { - KALDI_ASSERT(other.list_.empty()); -} +} // namespace kaldi +#endif // HAVE_CUDA -} +namespace kaldi { +// Define/initialize this global variable. It was declared in cu-allocator.h. +// This has to be done outside of the ifdef, because we register the options +// whether or not CUDA is compiled in (so that the binaries accept the same +// options). +CuAllocatorOptions g_allocator_options; -#endif // HAVE_CUDA +} diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 0f96315e848..20425704a2b 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -23,54 +23,137 @@ #define KALDI_CUDAMATRIX_CU_ALLOCATOR_H_ #if HAVE_CUDA == 1 - #include +#include +#include +#endif + #include +#include #include #include #include +#include #include -#include -#include #include "base/kaldi-common.h" #include "util/stl-utils.h" +#include "itf/options-itf.h" namespace kaldi { // For now we don't give the user a way to modify these from the command line. +// or the code, it just documents what the default options are. To change +// the options, you have to do it in the code. struct CuAllocatorOptions { - // memory_factor is the total amount of (allocated + cached) memory that we - // allow to be held, relative to the max amount of memory the program has ever - // allocated. It will increase the amount of memory the program will - // potentially consume, by this factor. 
- BaseFloat memory_factor; - - // This is the minimum amount of memory that we will delete when we are forced - // to delete stuff, relative to the max amount of memory the program has ever - // allocated. This should be less than memory_factor - 1.0 and > 0. It - // shouldn't be too critical. The reason it exists is to avoid calling the - // cleanup code and only releasing very small amounts of memory, because there - // is a constant overhead proportional to the number of buckets. - BaseFloat delete_factor; - - CuAllocatorOptions(): memory_factor(1.3), - delete_factor(0.001) { } + // True if we are going to actually cache memory allocations on this device. + // You'd normally set it to false only if you wanted to debug a possible + // memory problem using cuda-memcheck or cuda-gdb. It will be slower, but + // using CUDA's native allocator allows those tools to detect out-of-region + // memory accesses. + bool cache_memory; + + // The proportion of the device's memory that the CuAllocator allocates to + // start with; by default this is 0.8, although if you want to share the + // device (not recommended!) you should set this lower. + BaseFloat memory_proportion; + + // The target number of subregions of the entire CUDA device memory (we'll + // start with a smaller number of memory_proportion is << 1). Kind of + // a tuning knob.. more regions will make it more aggressively consolidate + // memory low addresses. + int32 num_subregions; + + CuAllocatorOptions(): + cache_memory(true), memory_proportion(0.5), num_subregions(20) { } + + void Register(OptionsItf *po) { + po->Register("cuda-cache-memory", &cache_memory, "True if you want " + "to use the caching allocator. Set this to false only if you " + "want to use cuda-memcheck or cuda-gdb; it will be slower."); + po->Register("cuda-memory-proportion", &memory_proportion, + "Proportion of the GPU device memory that the allocator " + "should allocate at the start"); + } void Check() { - KALDI_ASSERT(delete_factor < memory_factor - 1.0 && delete_factor > 0.0); + // don't let it get too close to 1; + KALDI_ASSERT(memory_proportion >= 0.05 && memory_proportion < 0.99); } }; +extern CuAllocatorOptions g_allocator_options; + +inline void RegisterCuAllocatorOptions(OptionsItf *po) { + g_allocator_options.Register(po); +} +} // namespace kaldi + + +#if HAVE_CUDA == 1 +namespace kaldi { + +/** + This class allocates large regions of memory from the GPU and allocates + sub-blocks of it for the user. This is needed because the CUDA malloc and + free routines are very slow. + + The user doesn't access this class directly, it is accessed via the CuDevice + object. The CuDevice class allocates memory using this class's Malloc() and + MallocPitch() functions, and frees them with its Free() function, and this + class caches the memory blocks to avoid calling the CUDA library's + malloc/free functions too often. If the application is using multiple + threads, it's necessary to lock this class before using it, and in that case + the CuDevice class calls the MallocLocking() and MallocPitchLocking() + versions of the allocation functions (but the user should call + CuDevice::AllowMultithreading() if the application plans to use GPU + functionality from multiple CPU threads). + + NOTE ON SYNCHRONIZATION: if multiple CUDA streams are used there is a + potential problem with any caching allocator which shares its pool across + CUDA streams. 
That is: if a memory block is freed by stream 1 and allocated to + stream 2, an operation might start in stream 2 before stream 1 has finished + working with that memory location. We solve this here using a rather low-tech + solution, relying on calling SynchronizeGpu() which submits a no-op kernel + into the legacy default stream. Each + time CuMemoryAllocator()::Free() is called and we cache the memory block + in this class, we record the thread-id of the CPU thread from which it was + freed, as well as a timestamp (the t_ member of CuMemoryAllocator, which + we increment every time the class is used). When we allocate memory + that was cached, we try to allocate it from a block that was relased by the + same CPU thread; and if that is not possible and we haven't called + SynchronizeGpu() since the block was freed, then we call + SynchronizeGpu(). The hope is that this will happen quite rarely. + Note that this is based on the assumption that the user is using the + per-thread default stream (indeed this is how we compile). If the + user were to make explicit use of CUDA streams, this mechanism would + not necessarily be sufficient to prevent data-race conditions and the + user might have to take further precautions. + + NOTE ON FRAGMENTATION: Memory fragmentation is one of the main problems that + you'll run into with allocators like this. This allocator will allocate a + small number of large regions of memory, and allocate smaller pieces of + memory that it splits off from the regions as needed. It will always merge + adjacent blocks as much as it can when the user frees memory. The main + heuristic to avoid memory fragmenting too much is that it always allocates, + where possible, from memory that's as close as possible to the start of a + memory region. This will tend to keep all the small allocations together at + the beginning of the memory region, and hopefully keep large blocks availale + at the end. The mechanism to always allocate from as close as possible to + the start of the memory region, is that we split up the memory regions into + a small number of sub-regions and, when handling a request for allocation, + allocate it from the lowest-numbered sub-region that can meet a request for + that size. (Note: we can allocate blocks that span sub-regions, so this + approach does not limit the block size we can allocate). + +*/ -// Class that caches memory for us (the CUDA -// malloc and free routines are very slow). -// This is a member of the CuDevice class. class CuMemoryAllocator { public: - /// Allocates memory on the CUDA device, of size 'size'. + /// Allocates memory on the CUDA device, of size 'size'. size == 0 is not + /// allowed and is an error. void* Malloc(size_t size); /// Allocation function for matrix-like things. @@ -95,156 +178,174 @@ class CuMemoryAllocator { Free(ptr); } + void PrintMemoryUsage() const; - // the maximum amount of memory that was ever allocated in the lifetime of the - // program, in bytes. - size_t MaxMemoryAllocated() const { return max_bytes_allocated_; } - - // memory held in the cache currently, in bytes. - size_t MemoryCached() const { return cur_bytes_allocated_ - cur_bytes_used_; } - - // memory that's cached plus memory that's allocated, in bytes. - size_t MemoryAllocated() const { return cur_bytes_allocated_; } + CuMemoryAllocator(); - void PrintMemoryUsage() const; + // Allows you to set options: must be called before any Malloc function is + // called on this class. 
It's done this way so the options can be changed + // by the user (c.f. RegisterCuAllocatorOptions()) before the options are read. + void SetOptions(const CuAllocatorOptions &opts) { opts_ = opts; } - CuMemoryAllocator(CuAllocatorOptions opts); private: - void FreeSomeCachedMemory(size_t bytes_to_free); + struct SubRegion; + + struct MemoryBlock { + char *begin; // The beginning of the block (in CUDA memory) + char *end; // the end of the block (in CUDA memory) + SubRegion *subregion; // Pointer to the SubRegion to which this memory + // block belongs. + bool allocated; // True if this MemoryBlock has currently been given to the + // user; false if not. + + size_t t; // Zero if this memory block was never given to the user; + // otherwise, the time value (t_ in the CuAllocator class) + // when it was most recently either allocated to the user + // or freed by the user. + + std::thread::id thread_id; // If allocated == false and t > 0 (i.e. this + // memory block was released by the user), the + // thread-id of the user thread that freed this + // block, or the invalid thread-id as created by + // the constructor of std::thread::id if this + // block was created by merging blocks from + // different threads. Required for + // synchronization; and note that we assume + // there is one CUDA stream per CPU thread. + + MemoryBlock *next; // The next MemoryBlock within this MemoryRegion (or + // NULL if this is the last one); its 'begin' would be + // the same as the 'end' of this block. + MemoryBlock *prev; // The previous MemoryBlock within this MemoryRegion (or + // NULL if this is the first one); its 'end' would be the + // same as the 'begin' of this block. - // This calls CudaMallocPitch, checks for errors (dies if it has to), and - // returns the result. It's up to the caller to do all the bookkeeping though. - inline void* MallocPitchInternal(size_t row_bytes, size_t num_rows, size_t *pitch); + }; - typedef std::pair MemoryRequest; // (row_bytes, num_rows). - struct CachedMemoryElement { - void *pointer; // the CUDA memory location that we own - size_t t; // time value when we put this in the cache. - size_t pitch; // pitch of this memory region (c.f. cudaMallocPitch()). - CachedMemoryElement() { } - CachedMemoryElement(void *pointer, size_t t, size_t pitch): - pointer(pointer), t(t), pitch(pitch) { } + // a MemoryRegion is a large piece of memory that we allocated via CudaMalloc. + // there normally won't be more than about 3 or 4 of these. + // We'll identify MemoryRegions by a size_t (e.g 0, 1, 2, 3... ) which is an + // index into the memory_regions_ vector. + struct MemoryRegion { + char *begin; // 'begin' is the start of the memory region. + char *end; // 'end' is the end of the memory region. + SubRegion *subregion_begin; // The first SubRegion that belongs to this + // MemoryRegion. + MemoryBlock *block_begin; // The first MemoryBlock that belongs to this + // MemoryRegion. }; - // This class caches a map from MemoryRequest to a list of CachedMemoryElements, - // and gives us access to the least-recently-used element for efficient. - // removal. - // We will have an instance of this class for each power-of-2 of size in - // bytes. This makes it easier to, when we need to delete something, find - // the item for which the (time-since-used * size-in-bytes) is approximately - // greatest. - class MruCache { - public: - size_t LeastRecentTime() const; // t value of least recent CachedMemoryElement (0 - // if empty). 
- - size_t RemoveLeastRecentlyUsed(); // Remove least-recently-used element - // from cache. Return size in bytes of - // that removed memory region. Crash if - // this was empty. - - // Attempts lookup of the most recently cached element corresponding to - // 'request'. If available, removes it from the cache and puts it to - // 'output', and returns true. Otherwise returns false. - bool Lookup(const MemoryRequest &request, - CachedMemoryElement *output); - - // Inserts this CachedMemoryElement to the list of CachedMemoryElements for this - // MemoryRequest. The time in the CachedMemoryElement is expected to be greater - // than times in previously supplied CachedMemoryElements. - void Insert(const MemoryRequest &request, - const CachedMemoryElement &element); - - struct MemoryRequestHasher { - // input is interpreted as (row_bytes, num_rows). row_bytes will always - // be a multiple of 4, and num_rows will frequently be a multiple of - // powers of 2 also. We need to shift right and add so that there will be - // some action in the lower-order bits. - size_t operator () (const std::pair &p) const noexcept { - size_t temp = p.first + 1867 * p.second; - return temp + (temp >> 2) + (temp >> 8); - } - }; - - MruCache() { } - // Define these to make inclusion in std::vector possible, but make them - // fail if called on anything but empty cache objects-- we never resize - // the vector of caches after initializing it. - MruCache &operator = (const MruCache &other); - MruCache(const MruCache &other); - private: - typedef std::list ListType; - typedef std::list::iterator ListIterType; - typedef std::deque > MapValueType; - typedef unordered_map MapType; - // 'list_' contains MemoryRequests with the most recent on the back (where they are added), - // and least recent on the front (where they are removed by RemoveLeastRecentlyUsed, although - // they are also removed from random parts of the list by Lookup(). - // There will in general be duplicates of MemoryRequests in the list, as - // many as there are entries in the MapValueType. - ListType list_; - // 'map_' maps from a MemoryRequest to a queue of (memory-element, - // iterator), with the most-recently-added things at the back; we remove - // things from the front of these queues (oldest) inside - // RemoveLeastRecentlyUsed(), and from the back (newest) in Lookup. - MapType map_; + // a SubRegion is a smaller zone of memory within a MemoryRegion. For + // example, we divide the first MemoryRegion we allocate into 10 blocks, and + // if we allocate blocks of memory later on, we'll sub-divide them into blocks + // of about the same size. A SubRegion is just a largish bin into which we + // put any blocks of memory that happen to start within that SubRegion; + // actually, memory blocks may cross over the boundaries of SubRegions. The + // motivation for dividing up MemoryRegions into SubRegions is that it allos + // us an efficient mechanism to segregate smaller memory blocks into higher + // memory and larger ones into lower memory: for each allocation, we allocate + // it from the highest-numbered SubRegion that is able to allocate something of + // that size. Over time, this will lead to smaller memory blocks being + // concentrated in higher-numbered SubRegions. + struct SubRegion { + size_t memory_region; // This is an index into the memory_regions_ vector + // which identifies which MemoryRegion this SubRegion + // is a part of. 
+ size_t subregion_index; // The index of this SubRegion within the + // subregions_ vector; this can change when we + // allocate more MemoryRegions. + char *begin; // 'begin' is the start of the memory in this SubRegion. + char *end; // 'end' is the end of the memory in this SubRegion. + + // Contains the free MemoryBlocks starting within this SubRegion. + std::set > free_blocks; + + // Pointer to the next SubRegion within this MemoryRegion (i.e. the SubRegion + // whose begin equals this one's end), or NULL if this is the last one. + SubRegion *next; }; + // Tries to allocate CUDA memory of the given size; will crash if it was not + // able to. + inline void* MallocInternal(size_t size); - inline MruCache &GetCacheForSize(size_t num_bytes); + // Allocates from a given SubRegion, after we have determined that it + // can satisfy this request. Broken out of MallocInternal for clarity. + inline void* MallocFromSubregion(SubRegion *subregion, size_t size); - CuAllocatorOptions opts_; - // indexed by log_2 (amount of memory requested), the caches. - std::vector caches_; + // Splits the given MemoryBlock so that one piece is of size 'size', and + // returns the piece which is of size 'size'. The caller guarantees that + // 'size' is less than the current size of the memory block, that 'block' is + // not currently allocated (i.e. block->allocated == false). This function + // assumes that, at entry, 'block' is not present in its subregion's + // 'free_blocks' (because the caller has removed it), and it takes + // responsibility for entering the 'unused' part (the part we're not + // returning) into its subregion's 'free_blocks' by calling AddToFreeBlocks(). + inline MemoryBlock *SplitBlock(MemoryBlock *block, size_t size); - size_t cur_bytes_allocated_; // number of bytes currently owned by callers or - // cached. - size_t max_bytes_allocated_; // the max over all time, of cur_bytes_allocated_. - size_t cur_bytes_used_; // number of bytes currently owned by callers. - size_t max_bytes_used_; // the max over all time, of cur_bytes_used_. - size_t t_; // time counter, incremented with each call. - size_t num_user_allocations_; // number of times user calls Malloc* - size_t num_system_allocations_; // number of times we call cudaMalloc*. - double tot_time_taken_in_cuda_malloc_; // time in cudaMalloc - double tot_time_taken_in_cuda_malloc_pitch_; // time in cudaMallocPitch - double tot_time_taken_in_cuda_free_; // time in cudaFree - double tot_time_taken_in_malloc_pitch_; // time in this->MallocPitch() - - - // a memory element is 'used' when it is currently possessed by the caller - // (and is not in our cache). - struct UsedMemoryElement { - size_t row_bytes; - size_t num_rows; - size_t pitch; - UsedMemoryElement() { } - UsedMemoryElement(size_t row_bytes, size_t num_rows, size_t pitch): - row_bytes(row_bytes), num_rows(num_rows), pitch(pitch) { } - }; + // Removes this block from the 'free_blocks' set of the SubRegion to which + // it belongs. This is called when allocating a block, and from other places. + void RemoveFromFreeBlocks(MemoryBlock *block); - struct PointerHasher { - size_t operator() (const void *arg) const noexcept { - // the last few bits tend to be very predictable, for alignment reasons (CUDA - // allocation may align on 256 byte or 512 byte boundaries or something similar). - size_t temp = reinterpret_cast(arg); - return (temp >> 4) + (temp >> 9); - } - }; + // Adds this block to the 'free_blocks' set of the SubRegion to which it + // belongs. 
This is called when freeing a block, and from other places. + void AddToFreeBlocks(MemoryBlock *block); - // This is a map from memory locations owned by the user, so we can recover - // the information when people call Free() and we add it back into the cache. - unordered_map used_map_; + // This function is called when an allocation failed and we need to try to + // allocate more memory from the evice. The 'size' is the size of the + // requested memory block whose allocation failed-- it's provided so that + // we can be sure to allocate a new region of at least this size. + void AllocateNewRegion(size_t size); + + // Called from AllocateNewRegion(), this ensures that the subregions are + // sorted as we want (which is a kind of heuristic that will be discussed in + // the code), and it also recomputes the largest_free_block_ array. + void SortSubregions(); - // this is only locked by the '*Locking' versions of the functions. - std::mutex mutex_; + + CuAllocatorOptions opts_; + + std::vector memory_regions_; + + std::vector subregions_; + + // For each SubRegion in sub_regions_, this vector gives us the size of the + // largest free block present in that SubRegion, which is equal to + // sub_regions_[i]->free_blocks.begin()->first. It allows us to fairly + // efficiently find the lowest-numbered SubRegion which can handle a + // particular request for memory. + std::vector largest_free_block_; + + size_t t_; // time counter, incremented with each call. + size_t synchronize_gpu_t_; // value of t_ at the last time we called + // SynchronizeGpu(). + size_t num_synchronizations_; // number of times we called SynchronizeGpu() + double tot_time_taken_; // Total time taken in calls to this object. + double malloc_time_taken_; // Total time we spent calling cudaMalloc(). + + // This is a map from memory locations currently owned by the user, to the + // MemoryBlock which stores the information about that location. + std::unordered_map allocated_block_map_; + + // this is only locked by the '*Locking' versions of the functions (necessary only + // in multi-threaded applications). + std::mutex mutex_; }; -} // namespace +// This function returns some printable information about the memory used +// as a string: an example showing the format is: +// "free: 10M, used: 490M, total: 500M: free/total: 0.02" +// In addition, if the pointers 'free' and 'total' are non-NULL, it will +// output to them the free memory and the total memory of the device. +std::string GetFreeGpuMemory(int64* free, int64* total); + +extern CuMemoryAllocator g_cuda_allocator; + +} // namespace kaldi #endif // HAVE_CUDA diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index c5114ed8b22..b8d6e7edbf5 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -23,7 +23,6 @@ #if HAVE_CUDA == 1 - #include #include #include @@ -42,23 +41,15 @@ #include "base/kaldi-utils.h" #include "util/common-utils.h" #include "util/kaldi-io.h" +// the following is for cuda_legacy_noop(). +#include "cudamatrix/cu-kernels-ansi.h" namespace kaldi { -/** - This function was added by Dan in July 2015 after upgrading on the CLSP - cluster to the CUDA 7.0 toolkit; the old mechanism of just calling - cudaThreadSynchronize() [==cudaDeviceSynchronize()] and having it - automagically select a GPU (when exclusive mode is on) doesn't seem to work - any more, in situations where GPU 0 is already being used. This works. 
It's - not 100% clear if the fact that the old code wasn't working was a bug, or a - changed feature (the NVidia docs were never super-clear regarding device - initialization). But regardless, changing to this new mechanism should be - harmless even if the problem was specific to the CLSP grid. -*/ - +/// This function attempts to get a CUDA device context on some available device +/// by doing 'cudaFree(0)'. If it succeeds it returns true; if it fails, it +/// outputs some debugging information into 'debug_str' and returns false. static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { - // Our first attempt to get a device context is: we do cudaFree(0) and see if // that returns no error code. If it succeeds then we have a device // context. Apparently this is the canonical way to get a context. @@ -88,53 +79,68 @@ static bool GetCudaContext(int32 num_gpus, std::string *debug_str) { return false; } -/** - * SelectGpuId(use_gpu) - * - * There are 3 'use_gpu' modes for GPU selection: - * "yes" -- Select GPU automatically (or get one by exclusive mode) - * and die if this fails. - * "optional" -- Do as above, but if it fails, back off to CPU. - * "no" -- Run on CPU. - * - * In case of Compute exclusive mode, the GPU is selected by OS. - * - * Otherwise GPU selection is based on largest proportion of free memory. - * This can eventually lead to multiple processes computing on single GPU, - * which is slow. More practical is to use "compute exclusive mode". - * - * This method is to be called at the very beginning of the program - * (before first allocation in cudamatrix), or not at all (default to CPU). - * - */ + +void CuDevice::Initialize() { + // This function may be called in the following two situations: + // + // (1) in the main thread, only when a GPU is not currently being used, either + // within a call like CuDevice()::Instantiate().SelectGpuId(..) + // (where the Instantiate() call will call Initialize() before SelectGpuId() + // is called, just because of how Instantiate() works), or in a call + // to 'CuDevice::Instantiate().Enabled()'. In this case it will just + // set initialized_ to true and notice that device_id_ == 1, and do nothing. + // + // (2) in threads created by the user, as soon as someone calls something that + // might potentially use the GPU, via CuDevice()::Instantiate(). + // If device_id_ is >= 0, this will create the cuBLAS and cuSparse handles. + KALDI_ASSERT(!initialized_); + initialized_ = true; + if (device_id_ == -1) { + // There is nothing to do; we are not using a GPU. + return; + } else { + if (!multi_threaded_) { + multi_threaded_ = true; + KALDI_WARN << "For multi-threaded code that might use GPU, you should call " + "CuDevice()::Instantiate().AllowMultithreading() at the start of " + "the program."; + } + device_id_copy_ = device_id_; + cudaSetDevice(device_id_); + // Initialize CUBLAS. + CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); + // Initialize the cuSPARSE library + CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); + + } +} + void CuDevice::SelectGpuId(std::string use_gpu) { - // Possible modes + if (device_id_ != -1) { + KALDI_ERR << "You cannot call SelectGpuId twice if, on the first time, " + "you requested a GPU."; + } if (use_gpu != "yes" && use_gpu != "no" && use_gpu != "optional" && use_gpu != "wait") { KALDI_ERR << "Please choose : --use-gpu=yes|no|optional|wait, passed '" << use_gpu << "'"; } - - // Make sure this function is not called twice! 
- if (Enabled()) { - KALDI_ERR << "There is already an active GPU " << active_gpu_id_ - << ", cannot change it on the fly!"; - } - // Allow the GPU to stay disabled - if (!Enabled() && use_gpu == "no") { + if (use_gpu == "no") { KALDI_LOG << "Manually selected to compute on CPU."; return; } - // Check that we have a gpu available int32 num_gpus = 0; cudaError_t e = cudaGetDeviceCount(&num_gpus); + // Make sure the global allocator object has the up-to-date options. + g_cuda_allocator.SetOptions(g_allocator_options); + if (num_gpus == 0) { if (use_gpu == "yes" || use_gpu == "wait") { KALDI_CUDA_ERR(e, "No CUDA GPU detected!"); } if (use_gpu == "optional") { - KALDI_WARN << "Running on CPU!!! No CUDA GPU detected..."; + KALDI_WARN << "No CUDA GPU detected; running on CPU since --use-gpu=optional specified."; return; } } @@ -183,8 +189,8 @@ void CuDevice::SelectGpuId(std::string use_gpu) { << " seconds before creating CUDA context"; } - // Re-assure we have the context - KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize()); + // Double check that we have the context + KALDI_ASSERT(cudaSuccess == cudaDeviceSynchronize()); // Check if the machine use compute exclusive mode if (IsComputeExclusive()) { @@ -196,7 +202,7 @@ void CuDevice::SelectGpuId(std::string use_gpu) { KALDI_WARN << "Not in compute-exclusive mode. Suggestion: use " "'nvidia-smi -c 3' to set compute exclusive mode"; // We want to choose the device more carefully, so release the CUDA context. - e = cudaThreadExit(); // deprecated, but for legacy reason not cudaDeviceReset + e = cudaDeviceReset(); if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to release CUDA context on a GPU"); } @@ -206,8 +212,8 @@ void CuDevice::SelectGpuId(std::string use_gpu) { FinalizeActiveGpu(); return; } else { - // Could not get GPU, after prevously having the CUDA context? - // Strange but not impossible... + // We could not get a GPU the second time, after prevously having the CUDA + // context. Strange but not impossible. if (use_gpu == "yes") { KALDI_ERR << "Error acquiring GPU."; } @@ -221,37 +227,38 @@ void CuDevice::SelectGpuId(std::string use_gpu) { void CuDevice::FinalizeActiveGpu() { - // The device at this point should have active GPU, so we can query its name - // and memory stats and notify user which GPU is finally used. + // The device at this point should have an active GPU, so we can query its + // name and memory stats and notify user which GPU is being used. - // Get the device-id of active device: + // Get the device-id of the active device. { - int32 act_gpu_id; - cudaError_t e = cudaGetDevice(&act_gpu_id); + int device_id; + cudaError_t e = cudaGetDevice(&device_id); if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get device-id of active device."); } - // Remember the id of active GPU - active_gpu_id_ = act_gpu_id; // CuDevice::Enabled() is true from now on + device_id_ = device_id; + device_id_copy_ = device_id; + initialized_ = true; // Prevent Initialize() from being called on this, + // the main thread. // Initialize CUBLAS. - CUBLAS_SAFE_CALL(cublasCreate(&handle_)); + CUBLAS_SAFE_CALL(cublasCreate(&cublas_handle_)); // Initialize the cuSPARSE library CUSPARSE_SAFE_CALL(cusparseCreate(&cusparse_handle_)); - // Notify user which GPU is finally used + // Notify the user which GPU is being userd. 
char name[128]; - DeviceGetName(name,128,act_gpu_id); + DeviceGetName(name,128, device_id); - CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, act_gpu_id)); + CU_SAFE_CALL(cudaGetDeviceProperties(&properties_, device_id)); - KALDI_LOG << "The active GPU is [" << act_gpu_id << "]: " << name << "\t" - << GetFreeMemory(&free_memory_at_startup_, NULL) << " version " + KALDI_LOG << "The active GPU is [" << device_id << "]: " << name << "\t" + << GetFreeGpuMemory(&free_memory_at_startup_, NULL) << " version " << properties_.major << "." << properties_.minor; } return; } - bool CuDevice::DoublePrecisionSupported() { if (!Enabled()) return true; return properties_.major > 1 || (properties_.major == 1 && properties_.minor >= 3); @@ -261,10 +268,10 @@ bool CuDevice::DoublePrecisionSupported() { bool CuDevice::IsComputeExclusive() { // assume we already have an CUDA context created - KALDI_ASSERT(cudaSuccess == cudaThreadSynchronize()); + KALDI_ASSERT(cudaSuccess == cudaDeviceSynchronize()); // get the device-id and its device-properties - int32 gpu_id = -1; + int gpu_id = -1; cudaError_t e = cudaGetDevice(&gpu_id); if (e != cudaSuccess) { KALDI_CUDA_ERR(e, "Failed to get current device"); @@ -279,11 +286,9 @@ bool CuDevice::IsComputeExclusive() { case cudaComputeModeExclusive : return true; break; -#if (CUDA_VERSION >= 4000) case cudaComputeModeExclusiveProcess : return true; break; -#endif default : // in this case we release the GPU context... return false; @@ -318,37 +323,35 @@ bool CuDevice::SelectGpuIdAuto() { switch(ret) { case cudaSuccess : { // create the CUDA context for the thread - cudaThreadSynchronize(); // deprecated, but for legacy not cudaDeviceSynchronize + cudaDeviceSynchronize(); // get GPU name char name[128]; DeviceGetName(name,128,n); // get GPU memory stats int64 free, total; std::string mem_stats; - mem_stats = GetFreeMemory(&free, &total); + mem_stats = GetFreeGpuMemory(&free, &total); // log KALDI_LOG << "cudaSetDevice(" << n << "): " << name << "\t" << mem_stats; - // We have seen that in some cases GetFreeMemory returns zero + // We have seen that in some cases GetFreeGpuMemory returns zero // That will produce nan after division, which might confuse // the sorting routine. Or maybe not, but let's keep it clean if (total <= 0) { - KALDI_LOG << "Total memory reported for device " << n << " is zero (or less)."; + KALDI_LOG << "Total memory reported for device " << n + << " is zero (or less)."; } float mem_ratio = total > 0 ? 
free/(float)total : 0; free_mem_ratio[n] = std::make_pair(n, mem_ratio); // destroy the CUDA context for the thread - cudaThreadExit(); // deprecated, but for legacy reason not cudaDeviceReset + cudaDeviceReset(); } break; - -#if (CUDA_VERSION > 3020) case cudaErrorDeviceAlreadyInUse : KALDI_LOG << "cudaSetDevice(" << n << "): " << "Device cannot be accessed, used EXCLUSIVE-THREAD mode..."; break; -#endif case cudaErrorInvalidDevice : KALDI_LOG << "cudaSetDevice(" << n << "): " << "Device cannot be accessed, not a VALID CUDA device!"; @@ -366,7 +369,7 @@ bool CuDevice::SelectGpuIdAuto() { // the free_mem_ratio should be bigger than zero KALDI_ASSERT(free_mem_ratio[max_id].second > 0.0); - float dev_id; + int dev_id; float mem_ratio; do { // try to select the GPU in the best to worst order @@ -382,7 +385,7 @@ bool CuDevice::SelectGpuIdAuto() { KALDI_WARN << "Cannot select this device: return code " << e << ", Error message: \"" << cudaGetErrorString(e) << "\""; } else { - e = cudaThreadSynchronize(); // deprecated, but for legacy not cudaDeviceSynchronize + e = cudaDeviceSynchronize(); if (e != cudaSuccess) { KALDI_WARN << "Cannot select this device: return code " << e << ", Error message: \"" << cudaGetErrorString(e) << "\""; @@ -403,10 +406,16 @@ bool CuDevice::SelectGpuIdAuto() { void CuDevice::AccuProfile(const char *function_name, const CuTimer &timer) { if (GetVerboseLevel() >= 1) { + std::unique_lock lock(profile_mutex_, std::defer_lock_t()); + if (multi_threaded_) + lock.lock(); std::string key(function_name); - cudaDeviceSynchronize(); + // by passing 0 as the stream to cudaStreamSynchronize, we are using the + // per-thread default stream. Since we compile with + // -DCUDA_API_PER_THREAD_DEFAULT_STREAM, this equates to a per-thread + // stream. + cudaStreamSynchronize(0); double elapsed = timer.Elapsed(); - if (profile_map_.find(key) == profile_map_.end()) profile_map_[key] = elapsed; else @@ -415,13 +424,8 @@ void CuDevice::AccuProfile(const char *function_name, } void CuDevice::PrintMemoryUsage() const { - if (Enabled()) { - allocator_.PrintMemoryUsage(); - int64 free_memory_now; - GetFreeMemory(&free_memory_now, NULL); - KALDI_LOG << "Memory used (according to the device): " - << (free_memory_at_startup_ - free_memory_now) << " bytes."; - } + if (Enabled()) + g_cuda_allocator.PrintMemoryUsage(); } void CuDevice::PrintProfile() { @@ -452,60 +456,6 @@ void CuDevice::PrintProfile() { } -std::string CuDevice::GetFreeMemory(int64* free, int64* total) const { - // WARNING! the CUDA API is inconsistent accross versions! 
-#ifdef _MSC_VER - size_t mem_free, mem_total; - cuMemGetInfo_v2(&mem_free, &mem_total); -#else -#if (CUDA_VERSION >= 3020) - // define the function signature type - size_t mem_free, mem_total; -#else - unsigned int mem_free, mem_total; -#endif - { - // we will load cuMemGetInfo_v2 dynamically from libcuda.so - // pre-fill ``safe'' values that will not cause problems - mem_free = 1; mem_total = 1; - // open libcuda.so - void* libcuda = dlopen("libcuda.so",RTLD_LAZY); - if (NULL == libcuda) { - KALDI_WARN << "cannot open libcuda.so"; - } else { - // define the function signature type - // and get the symbol -#if (CUDA_VERSION >= 3020) - typedef CUresult (*cu_fun_ptr)(size_t*, size_t*); - cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo_v2"); -#else - typedef CUresult (*cu_fun_ptr)(int*, int*); - cu_fun_ptr dl_cuMemGetInfo = (cu_fun_ptr)dlsym(libcuda,"cuMemGetInfo"); -#endif - if (NULL == dl_cuMemGetInfo) { - KALDI_WARN << "cannot load cuMemGetInfo from libcuda.so"; - } else { - // call the function - dl_cuMemGetInfo(&mem_free, &mem_total); - } - // close the library - dlclose(libcuda); - } - } -#endif - // copy the output values outside - if (NULL != free) *free = mem_free; - if (NULL != total) *total = mem_total; - // prepare the text output - std::ostringstream os; - os << "free:" << mem_free/(1024*1024) << "M, " - << "used:" << (mem_total-mem_free)/(1024*1024) << "M, " - << "total:" << mem_total/(1024*1024) << "M, " - << "free/total:" << mem_free/(float)mem_total; - return os.str(); -} - - void CuDevice::DeviceGetName(char* name, int32 len, int32 dev) { // prefill with something reasonable strncpy(name,"Unknown GPU",len); @@ -554,15 +504,48 @@ void CuDevice::CheckGpuHealth() { AccuProfile(__func__, t); } -CuDevice::CuDevice() : - active_gpu_id_(-1), debug_stride_mode_(false), - num_debug_stride_allocations_(0), allocator_(CuAllocatorOptions()), - multi_threaded_(false) { } +CuDevice::CuDevice(): + initialized_(false), + device_id_copy_(-1), + cublas_handle_(NULL), + cusparse_handle_(NULL) { +} + + +CuDevice::~CuDevice() { + if (cublas_handle_) + CUBLAS_SAFE_CALL(cublasDestroy(cublas_handle_)); + if (cusparse_handle_) + CUSPARSE_SAFE_CALL(cusparseDestroy(cusparse_handle_)); +} + +// Each thread has its own copy of the CuDevice object. +// Note: this was declared "static". +thread_local CuDevice CuDevice::this_thread_device_; -// The instance of the static singleton -CuDevice CuDevice::global_device_; +// define and initialize the static members of the CuDevice object. +int32 CuDevice::device_id_ = -1; +bool CuDevice::multi_threaded_ = false; +unordered_map CuDevice::profile_map_; +std::mutex CuDevice::profile_mutex_; +int64 CuDevice::free_memory_at_startup_; +cudaDeviceProp CuDevice::properties_; +bool CuDevice::debug_stride_mode_ = false; + + +void SynchronizeGpu() { + cuda_legacy_noop(); + CU_SAFE_CALL(cudaGetLastError()); } +} // namespace kaldi + +#else // #if HAVE_CUDA == 1 + +namespace kaldi { +// SynchronizeGpu() does nothing if we didn't compile for GPU. 
+void SynchronizeGpu() { } +} -#endif // HAVE_CUDA +#endif // #if HAVE_CUDA == 1 diff --git a/src/cudamatrix/cu-device.h b/src/cudamatrix/cu-device.h index 99105355a8f..4967ccb5045 100644 --- a/src/cudamatrix/cu-device.h +++ b/src/cudamatrix/cu-device.h @@ -24,7 +24,6 @@ #define KALDI_CUDAMATRIX_CU_DEVICE_H_ #if HAVE_CUDA == 1 - #include #include #include @@ -41,61 +40,95 @@ namespace kaldi { class CuTimer; /** - * Singleton object which represents the CUDA device - * responsible for CUBLAS initilalisation, collects profiling info + This class contains code for selecting the CUDA device, initializing the + cuBLAS and cuSparse handles, and providing an interface for memory allocation + (which supports caching, to avoid the slowness of the CUDA memory allocator). + + There is a separate instance of the CuDevice object for each thread of the + program, but many of its variables are static (hence, shared between all + instances). + + We only (currently) support using a single GPU device; however, we support + multiple CUDA streams. The expected programming model here is that you will + have multiple CPU threads, and each CPU thread automatically gets its own + CUDA stream because we compile with -DCUDA_API_PER_THREAD_DEFAULT_STREAM. + + In terms of synchronizing the activities of multiple threads: The CuDevice + object (with help from the underlying CuAllocator object) ensures that the + memory caching code won't itself be a cause of synchronization problems, + i.e. you don't have to worry that when you allocate with CuDevice::Malloc(), + the memory will still be in use by another thread on the GPU. However, it + may sometimes still be necessary to synchronize the activities of multiple + streams by calling the function SynchronizeGpu()-- probably right before a + thread increments a semaphore, right after it waits on a semaphore, or + right after it acquires a mutex, or something like that. + */ class CuDevice { - // Singleton object (there should only be one instantiated per program) public: - static inline CuDevice& Instantiate() { return global_device_; } - inline cublasHandle_t GetHandle() { return handle_; } + // You obtain the CuDevice for the current thread by calling + // CuDevice::Instantiate() + // At the beginning of the program, if you want to use a GPU, you + // should call CuDevice::Instantiate().SelectGpuId(..). + static inline CuDevice& Instantiate() { + CuDevice &ans = this_thread_device_; + if (!ans.initialized_) + ans.Initialize(); + return ans; + } + + inline cublasHandle_t GetCublasHandle() { return cublas_handle_; } inline cusparseHandle_t GetCusparseHandle() { return cusparse_handle_; } - // We provide functions Malloc, MallocPitch and Free which replace cudaMalloc, - // cudaMallocPitch and cudaFree. Their function is to cache the results of - // previous allocations to avoid the very large overhead that CUDA's - // allocation seems to give for some setups. + // We provide functions Malloc(), MallocPitch() and Free() which replace + // cudaMalloc(), cudaMallocPitch() and cudaFree(). Their function is to cache + // the results of previous allocations to avoid the very large overhead that + // CUDA's allocation seems to give for some setups. inline void* Malloc(size_t size) { - return multi_threaded_ ? allocator_.MallocLocking(size) : - allocator_.Malloc(size); + return multi_threaded_ ? 
g_cuda_allocator.MallocLocking(size) : + g_cuda_allocator.Malloc(size); } inline void* MallocPitch(size_t row_bytes, size_t num_rows, size_t *pitch) { if (multi_threaded_) { - return allocator_.MallocPitchLocking(row_bytes, num_rows, pitch); + return g_cuda_allocator.MallocPitchLocking(row_bytes, num_rows, pitch); } else if (debug_stride_mode_) { // The pitch bucket size is hardware dependent. // It is 512 on K40c with CUDA 7.5 // "% 8" ensures that any 8 adjacent allocations have different pitches // if their original pitches are same in the normal mode. - return allocator_.MallocPitch( - row_bytes + 512 * ((num_debug_stride_allocations_++) % 8), num_rows, + return g_cuda_allocator.MallocPitch( + row_bytes + 512 * RandInt(0, 4), num_rows, pitch); } else { - return allocator_.MallocPitch(row_bytes, num_rows, pitch); + return g_cuda_allocator.MallocPitch(row_bytes, num_rows, pitch); } } + inline void Free(void *ptr) { - if (multi_threaded_) allocator_.FreeLocking(ptr); - else allocator_.Free(ptr); + if (multi_threaded_) g_cuda_allocator.FreeLocking(ptr); + else g_cuda_allocator.Free(ptr); } - /// Select a GPU for computation, the 'use_gpu' modes are: - /// "yes" -- Select GPU automatically and die if this fails. + /// Select a GPU for computation. You are supposed to call this function just + /// once, at the beginning of the program (from the main thread), or not at + /// all. + /// The 'use_gpu' modes are: + /// "yes" -- Select GPU automatically and die if this fails. If you have set + /// the GPUs to exclusive mode it will select one + /// pseudo-randomly; otherwise it will choose whichever one has + /// the most free memory (but we recommend to set GPUs to + /// exclusive mode, or controlling which GPU to use by setting + /// the variable CUDA_VISIBLE_DEVICES to the id of the GPU you + /// want the program to use. /// "optional" -- Do as above, but if it fails, back off to CPU. /// "no" -- Run on CPU. - /// (more comments in cu-device.cc) void SelectGpuId(std::string use_gpu); /// Check if the CUDA GPU is selected for use bool Enabled() const { - return (active_gpu_id_ > -1); - } - - /// Get the active GPU id - int32 ActiveGpuId() { - return active_gpu_id_; + return (device_id_ > -1); } /// Returns true if either we have no GPU, or we have a GPU @@ -106,21 +139,19 @@ class CuDevice { /// are printed out when you call PrintProfile(). However, /// it only does something if VerboseLevel() >= 1. void AccuProfile(const char *function_name, const CuTimer &timer); + + /// Print some profiling information using KALDI_LOG. void PrintProfile(); + /// Print some memory-usage information using KALDI_LOG. void PrintMemoryUsage() const; /// The user should call this if the program plans to access the GPU (e.g. via /// using class CuMatrix) from more than one thread. If you fail to call this - /// for a multi-threaded program, it will occasionally segfault. + /// for a multi-threaded program, it may occasionally segfault (and also + /// the code will detect that you failed to call it, and will print a warning). inline void AllowMultithreading() { multi_threaded_ = true; } - void ResetProfile() { - profile_map_.clear(); - } - - /// Get the actual GPU memory use stats - std::string GetFreeMemory(int64* free = NULL, int64* total = NULL) const; /// Get the name of the GPU void DeviceGetName(char* name, int32 len, int32 dev); @@ -153,22 +184,33 @@ class CuDevice { /// (i.e. from outside the class), call this only if Enabled() returns true. 
bool IsComputeExclusive(); - private: CuDevice(); + + ~CuDevice(); + private: CuDevice(CuDevice&); // Disallow. CuDevice &operator=(CuDevice&); // Disallow. - static CuDevice global_device_; - cublasHandle_t handle_; - cusparseHandle_t cusparse_handle_; + /// The Initialize() function exists to do the following, in threads other + /// than the main thread, and only if we are using a GPU: call + /// cudaSetDevice(), and set up cublas_handle_ and cusparse_handle_. It does + /// get called in the main thread (see documentation by its definition), but + /// does nothing interesting there. + void Initialize(); - /// Automatically select GPU and get CUDA context. Returns true on success. + /// Automatically select GPU and get CUDA context (this is only called, from + /// SelectGpuId(), if the GPUs are in non-exclusive mode). Returns true on + /// success. bool SelectGpuIdAuto(); - /// Try to get CUDA context on manually selected GPU. Return true on success. - bool SelectGpuIdManual(int32 gpu_id); - + /// This function, called from SelectGpuId(), is to be called when a + /// GPU context corresponding to the GPU we want to use exists; it + /// works out the device-id, creates the cuBLAS and cuSparse handles, + /// and prints out some information that's useful for debugging. + /// It also sets initialized_ to true, to suppress Initialize() from + /// being called on this, the main thread, in future, since + /// that would try to create the handles again. void FinalizeActiveGpu(); /// Should only be called if Enabled() == true. @@ -177,29 +219,58 @@ class CuDevice { /// Should only be called if Enabled() == true. int32 MinorDeviceVersion(); - unordered_map profile_map_; - /// active_gpu_id_ values: - /// -3 default (default, the SelectGpuId was not called, we did not want to use GPU) - /// -2 SelectGpuId was called, but no GPU was present - /// -1 SelectGpuId was called, but the GPU was manually disabled - /// 0..N Normal GPU IDs - int32 active_gpu_id_; + // Each thread has its own CuDevice object, which contains the cublas and + // cusparse handles. These are unique to the thread (which is what is + // recommended by NVidia). + static thread_local CuDevice this_thread_device_; + + // The GPU device-id that we are using. This will be initialized to -1, and will + // be set when the user calls + // CuDevice::Instantiate::SelectGpuId(...) + // from the main thread. Background threads will, when spawned and when + // CuDevice::Instantiate() is called from them the first time, will + // call cudaSetDevice(device_id)) + static int32 device_id_; - int64 free_memory_at_startup_; + // This will automatically be set to true if the application has multiple + // threads that access the GPU device. It is used to know whether to + // use locks when accessing the allocator and the profiling-related code. + static bool multi_threaded_; - cudaDeviceProp properties_; + // The variable profile_map_ will only be used if the verbose level is >= 1; + // it will accumulate some function-level timing information that is printed + // out at program end. This makes things a bit slower as we have to call + // cudaDeviceSynchronize() to make the timing information meaningful. + static unordered_map profile_map_; + // profile_mutex_ guards profile_map_ in case multi_threaded_ is true. + static std::mutex profile_mutex_; - // there used to be a 'bool verbose_' here. I'm leaving a placeholder here - // instead of removing it because it causes particularly hard-to-debug errors - // if compilation is not done right (e.g. 
make depend was not done), and this - // class's members move about. - bool unused_; - bool debug_stride_mode_; - uint32 num_debug_stride_allocations_; + // free_memory_at_startup_ is just used in printing the memory used according + // to the device. + static int64 free_memory_at_startup_; + static cudaDeviceProp properties_; + + // If set to true by SetDebugStrideMode(), code will be activated to use + // pseudo-random stride values when allocating data (to detect errors which + // otherwise would be rare). + static bool debug_stride_mode_; + + + // The following member variable is initialized to false; if the user calls + // Instantiate() in a thread where it is still false, Initialize() will be + // called, in order to -- if a GPU is being used-- call cudaSetDevice() and + // set up the cublas and cusparse handles. + bool initialized_; + + // This variable is just a copy of the static variable device_id_. It's used + // to detect when this code is called in the wrong way. + int32 device_id_copy_; + + cublasHandle_t cublas_handle_; + + cusparseHandle_t cusparse_handle_; - CuMemoryAllocator allocator_; - bool multi_threaded_; // true if user called AllowMultithreading(). }; // class CuDevice @@ -214,13 +285,38 @@ class CuTimer: public Timer { // This function is declared as a more convenient way to get the CUDA device handle for use // in the CUBLAS v2 API, since we so frequently need to access it. -inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetHandle(); } +inline cublasHandle_t GetCublasHandle() { return CuDevice::Instantiate().GetCublasHandle(); } // A more convenient way to get the handle to use cuSPARSE APIs. inline cusparseHandle_t GetCusparseHandle() { return CuDevice::Instantiate().GetCusparseHandle(); } -} // namespace + +} // namespace kaldi #endif // HAVE_CUDA -#endif +namespace kaldi { + +/** + The function SynchronizeGpu(), which for convenience is defined whether or + not we have compiled for CUDA, is intended to be called in places where threads + need to be synchronized. + + It just launches a no-op kernel into the legacy default stream. This will + have the effect that it will run after any kernels previously launched from + any stream(*), and before kernels that will later be launched from any stream(*). + (*) does not apply to non-blocking streams. + + Note: at the time of writing we never call SynchronizeGpu() from binary-level + code because it hasn't become necessary yet; the only program that might have + multiple threads actually using the GPU is rnnlm-train (if the user were to + invoke it with the ,bg option for loading training examples); but the only + CUDA invocation the RnnlmExample::Read() function uses (via + CuMatrix::Read()), is cudaMemcpy, which is synchronous already. + +*/ +void SynchronizeGpu(); + +} // namespace kaldi + +#endif // KALDI_CUDAMATRIX_CU_DEVICE_H_ diff --git a/src/cudamatrix/cu-kernels-ansi.h b/src/cudamatrix/cu-kernels-ansi.h index ebbcb9da5ff..a61bb601e8e 100644 --- a/src/cudamatrix/cu-kernels-ansi.h +++ b/src/cudamatrix/cu-kernels-ansi.h @@ -790,6 +790,10 @@ void cuda_uncompress_uint8(dim3 Gr, dim3 Bl, BaseFloat *dest, MatrixDim dim, const uint8_t *src, int src_stride, float scale); +// Launches a kernel that does nothing, explicitly using the legacy default stream; +// this will synchronize all CUDA streams (except for non-blocking streams) on the +// device. 
+void cuda_legacy_noop(); } // extern "C" diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu index 4101d5ba52f..5a5307b9f87 100644 --- a/src/cudamatrix/cu-kernels.cu +++ b/src/cudamatrix/cu-kernels.cu @@ -3699,7 +3699,9 @@ static void _cuda_uncompress(BaseFloat *dest, MatrixDim dim, } } - +__global__ +static void _noop_kernel() { +} /*********************************************************************** * ANSI-C wrappers of CUDA kernels @@ -5459,3 +5461,10 @@ void cuda_uncompress_int16(dim3 Gr, dim3 Bl, BaseFloat *dest, int src_stride, float scale) { _cuda_uncompress<<>>(dest, dim, src, src_stride, scale); } + + +// Launches a kernel that does nothing, explicitly using the legacy default stream; +// this will synchronize all threads without blocking. +void cuda_legacy_noop() { + _noop_kernel<<<1, 1, 0, cudaStreamLegacy>>>(); +} diff --git a/src/cudamatrix/cu-matrix-test.cc b/src/cudamatrix/cu-matrix-test.cc index 01030bb8353..ba91f65e484 100644 --- a/src/cudamatrix/cu-matrix-test.cc +++ b/src/cudamatrix/cu-matrix-test.cc @@ -2620,7 +2620,11 @@ static int32 DoubleFactorial(int32 i) { template static void UnitTestCuMatrixSetRandn() { - { // First test consistency when called twice. + + if (false) { + // This block tests consistency when called twice. + // It has been disabled since we added multi-threaded testing, + // since consistency wouldn't be expected if other threads were running. int32 dimM = 100 + Rand() % 200, dimN = 100 + Rand() % 200; Matrix M(dimM, dimN), N(dimM, dimN); srand(104); @@ -3040,16 +3044,38 @@ template void CudaMatrixUnitTest() { int main() { SetVerboseLevel(1); int32 loop = 0; + bool test_threads = true; + // num_threads only matters if test_threads == true. Don't make it + // to large, because it will affect CPU usage if you are using CPU. + int32 num_threads = 4; + + #if HAVE_CUDA == 1 for (loop = 0; loop < 2; loop++) { CuDevice::Instantiate().SetDebugStrideMode(true); + if (test_threads) + CuDevice::Instantiate().AllowMultithreading(); if (loop == 0) CuDevice::Instantiate().SelectGpuId("no"); else CuDevice::Instantiate().SelectGpuId("yes"); #endif - kaldi::CudaMatrixUnitTest(); + if (test_threads) { + KALDI_LOG << "Doing matrix unit test with " + << num_threads << " threads."; + std::vector threads; + for (int32 i = 0; i < num_threads - 1; i++) + threads.push_back(new std::thread(kaldi::CudaMatrixUnitTest)); + // the last thread running is the main thread. 
+ kaldi::CudaMatrixUnitTest(); + for (size_t i = 0; i < threads.size(); i++) { + threads[i]->join(); + delete threads[i]; + } + } else { + kaldi::CudaMatrixUnitTest(); + } #if HAVE_CUDA == 1 if (CuDevice::Instantiate().DoublePrecisionSupported()) { diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile index 39e4ae39bcc..644eb639381 100644 --- a/src/fstbin/Makefile +++ b/src/fstbin/Makefile @@ -15,7 +15,7 @@ BINFILES = fstdeterminizestar \ fstmakecontextsyms fstaddsubsequentialloop fstaddselfloops \ fstrmepslocal fstcomposecontext fsttablecompose fstrand \ fstdeterminizelog fstphicompose fstcopy \ - fstpushspecial fsts-to-transcripts fsts-project fsts-union + fstpushspecial fsts-to-transcripts fsts-project fsts-union fsts-concat OBJFILES = diff --git a/src/fstbin/fsts-concat.cc b/src/fstbin/fsts-concat.cc new file mode 100644 index 00000000000..2a217eda7dc --- /dev/null +++ b/src/fstbin/fsts-concat.cc @@ -0,0 +1,112 @@ +// fstbin/fsts-concat.cc + +// Copyright 2016 Johns Hopkins University (Authors: Jan "Yenda" Trmal) +// 2018 Soapbox Labs (Author: Karel Vesely) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "fstext/fstext-utils.h" +#include "fstext/kaldi-fst-io.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace fst; + typedef kaldi::int32 int32; + typedef kaldi::uint64 uint64; + + const char *usage = + "Reads kaldi archives with FSTs. Concatenates the fsts from all the rspecifiers.\n" + "The fsts to concatenate must have same key. The sequencing is given by the position of arguments.\n" + "\n" + "Usage: fsts-concat [options] ... \n" + " e.g.: fsts-concat scp:fsts1.scp scp:fsts2.scp ... ark:fsts_out.ark\n" + "\n" + "see also: fstconcat (from the OpenFst toolkit)\n"; + + ParseOptions po(usage); + + po.Read(argc, argv); + + if (po.NumArgs() < 3) { + po.PrintUsage(); + exit(1); + } + + std::string fsts_rspecifier = po.GetArg(1), + fsts_wspecifier = po.GetArg(po.NumArgs()); + + SequentialTableReader fst_reader(fsts_rspecifier); + std::vector*> fst_readers; + TableWriter fst_writer(fsts_wspecifier); + + for (int32 i = 2; i < po.NumArgs(); i++) + fst_readers.push_back(new RandomAccessTableReader(po.GetArg(i))); + const int32 num_fst_readers = fst_readers.size(); + + int32 n_done = 0, + n_skipped = 0; + + for (; !fst_reader.Done(); fst_reader.Next()) { + std::string key = fst_reader.Key(); + + // Check that the key exists in all 'fst_readers'. + bool skip_key = false; + for (int32 i = 0; i < num_fst_readers; i++) { + if (!fst_readers[i]->HasKey(key)) { + KALDI_WARN << "Skipping '" << key << "'" + << " due to missing the fst in " << (i+2) << "th : " + << "'" << po.GetArg(i+2) << "'"; + skip_key = true; + } + } + if (skip_key) { + n_skipped++; + continue; + } + + // Concatenate! 
+ VectorFst fst_out = fst_readers.back()->Value(key); + // Loop from (last-1) to first, as 'prepending' the fsts is faster, + // see: http://www.openfst.org/twiki/bin/view/FST/ConcatDoc + for (int32 i = num_fst_readers-2; i >= 0; i--) { + fst::Concat(fst_readers[i]->Value(key), &fst_out); + } + // Finally, prepend the fst from the 'Sequential' reader. + fst::Concat(fst_reader.Value(), &fst_out); + + // Write the output. + fst_writer.Write(key, fst_out); + n_done++; + } + + // Cleanup. + for (int32 i = 0; i < num_fst_readers; i++) + delete fst_readers[i]; + fst_readers.clear(); + + KALDI_LOG << "Produced " << n_done << " FSTs by concatenating " << po.NumArgs()-1 + << " streams " << "(" << n_skipped << " keys skipped)."; + return (n_done != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/ivectorbin/ivector-compute-plda.cc b/src/ivectorbin/ivector-compute-plda.cc index 2e874adcca6..c955f07bd68 100644 --- a/src/ivectorbin/ivector-compute-plda.cc +++ b/src/ivectorbin/ivector-compute-plda.cc @@ -85,6 +85,7 @@ int main(int argc, char *argv[]) { num_utt_done++; } } + if (ivectors.size() == 0) { KALDI_WARN << "Not producing output for speaker " << spk << " since no utterances had iVectors"; @@ -101,6 +102,10 @@ int main(int argc, char *argv[]) { } } + if (num_utt_done <= plda_stats.Dim()) + KALDI_ERR << "Number of training iVectors is not greater than their " + << "dimension, unable to estimate PLDA."; + KALDI_LOG << "Accumulated stats from " << num_spk_done << " speakers (" << num_spk_err << " with no utterances), consisting of " << num_utt_done << " utterances (" << num_utt_err diff --git a/src/ivectorbin/ivector-plda-scoring-dense.cc b/src/ivectorbin/ivector-plda-scoring-dense.cc index 076fd41ad09..73ca879e6bc 100644 --- a/src/ivectorbin/ivector-plda-scoring-dense.cc +++ b/src/ivectorbin/ivector-plda-scoring-dense.cc @@ -194,8 +194,8 @@ int main(int argc, char *argv[]) { } else { KALDI_WARN << "Unable to compute conversation dependent PCA for" << " recording " << reco << "."; - ivector_mat_pca.Resize(ivector_mat.NumRows(), ivector_mat.NumCols()); - ivector_mat_pca.CopyFromMat(ivector_mat); + TransformIvectors(ivector_mat, plda_config, this_plda, + &ivector_mat_plda); } for (int32 i = 0; i < ivector_mat_plda.NumRows(); i++) { for (int32 j = 0; j < ivector_mat_plda.NumRows(); j++) { diff --git a/src/makefiles/cuda_32bit.mk b/src/makefiles/cuda_32bit.mk index 9a343d1ae24..f6ddfb6d80f 100644 --- a/src/makefiles/cuda_32bit.mk +++ b/src/makefiles/cuda_32bit.mk @@ -7,7 +7,8 @@ endif CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 32 -DHAVE_CUDA \ - -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) + -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DCUDA_API_PER_THREAD_DEFAULT_STREAM CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include LDFLAGS += -L$(CUDATKDIR)/lib -Wl,-rpath=$(CUDATKDIR)/lib LDLIBS += -lcublas -lcusparse -lcudart -lcurand #LDLIBS : The libs are loaded later than static libs in implicit rule diff --git a/src/makefiles/cuda_64bit.mk b/src/makefiles/cuda_64bit.mk index be76798b1d3..6a428e7391f 100644 --- a/src/makefiles/cuda_64bit.mk +++ b/src/makefiles/cuda_64bit.mk @@ -7,7 +7,8 @@ endif CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA \ - -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) + -ccbin $(CXX) -DKALDI_DOUBLEPRECISION=$(DOUBLE_PRECISION) \ + -DCUDA_API_PER_THREAD_DEFAULT_STREAM CXXFLAGS += 
-DHAVE_CUDA -I$(CUDATKDIR)/include CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 CUDA_LDLIBS += -lcublas -lcusparse -lcudart -lcurand #LDLIBS : The libs are loaded later than static libs in implicit rule diff --git a/src/nnet3/natural-gradient-online.cc b/src/nnet3/natural-gradient-online.cc index b5740053f46..0677e1ca474 100644 --- a/src/nnet3/natural-gradient-online.cc +++ b/src/nnet3/natural-gradient-online.cc @@ -28,7 +28,7 @@ OnlineNaturalGradient::OnlineNaturalGradient(): rank_(40), update_period_(1), num_samples_history_(2000.0), num_minibatches_history_(0.0), alpha_(4.0), epsilon_(1.0e-10), delta_(5.0e-04), frozen_(false), t_(0), - self_debug_(false) { } + self_debug_(false), rho_t_(-1.0e+10) { } /** @@ -623,6 +623,21 @@ void OnlineNaturalGradient::SetAlpha(BaseFloat alpha) { alpha_ = alpha; } - -} +void OnlineNaturalGradient::Swap(OnlineNaturalGradient *other) { + std::swap(rank_, other->rank_); + std::swap(update_period_, other->update_period_); + std::swap(num_samples_history_, other->num_samples_history_); + std::swap(num_minibatches_history_, other->num_minibatches_history_); + std::swap(alpha_, other->alpha_); + std::swap(epsilon_, other->epsilon_); + std::swap(delta_, other->delta_); + std::swap(frozen_, other->frozen_); + std::swap(t_, other->t_); + std::swap(self_debug_, other->self_debug_); + W_t_.Swap(&(other->W_t_)); + std::swap(rho_t_, other->rho_t_); + d_t_.Swap(&(other->d_t_)); } + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/natural-gradient-online.h b/src/nnet3/natural-gradient-online.h index b49769da540..a68ad9bbb53 100644 --- a/src/nnet3/natural-gradient-online.h +++ b/src/nnet3/natural-gradient-online.h @@ -466,6 +466,9 @@ class OnlineNaturalGradient { explicit OnlineNaturalGradient(const OnlineNaturalGradient &other); // Assignent operator OnlineNaturalGradient &operator = (const OnlineNaturalGradient &other); + + // Shallow swap + void Swap(OnlineNaturalGradient *other); private: diff --git a/src/nnet3/nnet-chain-training.cc b/src/nnet3/nnet-chain-training.cc index 2ec2699ec97..87eacf75327 100644 --- a/src/nnet3/nnet-chain-training.cc +++ b/src/nnet3/nnet-chain-training.cc @@ -88,7 +88,10 @@ void NnetChainTrainer::Train(const NnetChainExample &chain_eg) { } else { // conventional training TrainInternal(chain_eg, *computation); } - + if (num_minibatches_processed_ == 0) { + ConsolidateMemory(nnet_); + ConsolidateMemory(delta_nnet_); + } num_minibatches_processed_++; } diff --git a/src/nnet3/nnet-common.cc b/src/nnet3/nnet-common.cc index 31ff9819dfa..e4bd1a402eb 100644 --- a/src/nnet3/nnet-common.cc +++ b/src/nnet3/nnet-common.cc @@ -392,11 +392,12 @@ size_t IndexVectorHasher::operator () ( // skipping over more elements. Setting n1 large or // n2 to 1 would make the hasher consider all // elements. + size_t len = index_vector.size(); // all long-ish numbers appearing below are randomly chosen primes. - size_t ans = 1433 + 34949 * index_vector.size(); + size_t ans = 1433 + 34949 * len; std::vector::const_iterator iter = index_vector.begin(), end = index_vector.end(), med = end; - if (med > iter + n1) + if (n1 < len) med = iter + n1; for (; iter != med; ++iter) { @@ -412,6 +413,10 @@ size_t IndexVectorHasher::operator () ( ans += iter->n * 1619; ans += iter->t * 15649; ans += iter->x * 89809; + // The following if-statement was introduced in order to fix an + // out-of-range iterator problem on Windows. 
+ if (n2 > len || iter >= end - n2) + break; } return ans; } diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index 2c76805f5cc..d2d325d22f1 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -367,8 +367,10 @@ void NonlinearComponent::StoreStatsInternal( void NonlinearComponent::StoreBackpropStats( const CuMatrixBase &out_deriv) { - // only store these stats about every 4 minibatches. - if (RandInt(0, 3) == 0) + // Only store these stats about every 4 minibatches. Make sure to always + // store the stats on the very first minibatch, or it would interact badly + // with the ConsolidateMemory() code. + if (RandInt(0, 3) == 0 && oderiv_count_ != 0) return; KALDI_ASSERT(out_deriv.NumCols() == dim_); @@ -622,7 +624,11 @@ void NonlinearComponent::InitFromConfig(ConfigLine *cfl) { << Type() << ": \"" << cfl->WholeLine() << "\""; } - +void NonlinearComponent::ConsolidateMemory() { + { CuVector temp(value_sum_); value_sum_.Swap(&temp); } + { CuVector temp(deriv_sum_); deriv_sum_.Swap(&temp); } + { CuVector temp(oderiv_sumsq_); oderiv_sumsq_.Swap(&temp); } +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 01697353308..32d6b3d305d 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -375,6 +375,23 @@ class Component { /// backprop to consume it. virtual void DeleteMemo(void *memo) const { KALDI_ASSERT(memo == NULL); } + /// This virtual function relates to memory management, and avoiding + /// fragmentation. It is called only once per model, after we do the first + /// minibatch of training. The default implementation does nothing, but it + /// can be overridden by child classes, where it may re-initialize certain + /// quantities that may possibly have been allocated during the forward pass + /// (e.g. certain statistics; OnlineNaturalGradient objects). We use our own + /// CPU-based allocator (see cu-allocator.h) and since it can't do paging + /// since we're not in control of the GPU page table, fragmentation can be a + /// problem. The allocator always tries to put things in 'low-address memory' + /// (i.e. at smaller memory addresses) near the beginning of the block it + /// allocated, to avoid fragmentation; but if permanent things (belonging to + /// the model) are allocated in the forward pass, they can permanently stay in + /// high memory. This function helps to prevent that, by re-allocating those + /// things into low-address memory (It's important that it's called after all the + /// temporary buffers for the forward-backward have been freed, so that there + /// is low-address memory available)). + virtual void ConsolidateMemory() { } Component() { } @@ -620,6 +637,8 @@ class NonlinearComponent: public Component { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); + virtual void ConsolidateMemory(); + // The following functions are unique to NonlinearComponent. // They mostly relate to diagnostics. 
const CuVector &ValueSum() const { return value_sum_; } diff --git a/src/nnet3/nnet-convolutional-component.cc b/src/nnet3/nnet-convolutional-component.cc index f48a3968c88..7a1617f261a 100644 --- a/src/nnet3/nnet-convolutional-component.cc +++ b/src/nnet3/nnet-convolutional-component.cc @@ -665,6 +665,12 @@ void TimeHeightConvolutionComponent::PrecomputedIndexes::Read( ExpectToken(is, binary, ""); } +void TimeHeightConvolutionComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_in(preconditioner_in_); + preconditioner_in_.Swap(&temp_in); + OnlineNaturalGradient temp_out(preconditioner_out_); + preconditioner_out_.Swap(&temp_out); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-convolutional-component.h b/src/nnet3/nnet-convolutional-component.h index e107962abc2..279cec321dd 100644 --- a/src/nnet3/nnet-convolutional-component.h +++ b/src/nnet3/nnet-convolutional-component.h @@ -300,6 +300,8 @@ class TimeHeightConvolutionComponent: public UpdatableComponent { }; void ScaleLinearParams(BaseFloat alpha) { linear_params_.Scale(alpha); } + + void ConsolidateMemory(); private: void Check() const; @@ -556,6 +558,8 @@ class TdnnComponent: public UpdatableComponent { CuVector &BiasParams() { return bias_params_; } BaseFloat OrthonormalConstraint() const { return orthonormal_constraint_; } + + void ConsolidateMemory(); private: // This static function is a utility function that extracts a CuSubMatrix diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc index 00a31fa897c..48cc0112368 100644 --- a/src/nnet3/nnet-general-component.cc +++ b/src/nnet3/nnet-general-component.cc @@ -1385,7 +1385,10 @@ void ConstantComponent::UnVectorize(const VectorBase ¶ms) { output_.CopyFromVec(params); } - +void ConstantComponent::ConsolidateMemory() { + OnlineNaturalGradient temp(preconditioner_); + preconditioner_.Swap(&temp); +} std::string DropoutMaskComponent::Info() const { std::ostringstream stream; diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index cff73a55b59..d2def5d6e7e 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -679,6 +679,8 @@ class ConstantComponent: public UpdatableComponent { virtual int32 NumParameters() const; virtual void Vectorize(VectorBase *params) const; virtual void UnVectorize(const VectorBase ¶ms); + + virtual void ConsolidateMemory(); private: // the output value-- a vector. diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index 69f8442a08a..7a5eb7017a3 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -417,8 +417,10 @@ void SigmoidComponent::RepairGradients( void SigmoidComponent::StoreStats(const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { - // only store stats about every other minibatch. - if (RandInt(0, 1) == 0) + // Only store stats about every other minibatch (but on the first minibatch, + // always store it, which is necessary for the ConsolidateMemory() operation + // to work correctly. + if (RandInt(0, 1) == 0 && count_ != 0) return; // derivative of the nonlinearity is out_value * (1.0 - out_value); CuMatrix temp_deriv(out_value.NumRows(), out_value.NumCols(), @@ -939,8 +941,10 @@ void TanhComponent::Backprop(const std::string &debug_info, void TanhComponent::StoreStats(const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { - // only store stats about every other minibatch. 
- if (RandInt(0, 1) == 0) + // Only store stats about every other minibatch (but on the first minibatch, + // always store it, which is necessary for the ConsolidateMemory() operation + // to work correctly. + if (RandInt(0, 1) == 0 && count_ != 0) return; // derivative of the onlinearity is out_value * (1.0 - out_value); CuMatrix temp_deriv(out_value); @@ -1073,8 +1077,10 @@ void RectifiedLinearComponent::StoreStats( const CuMatrixBase &in_value, const CuMatrixBase &out_value, void *memo) { - // only store stats about every other minibatch. - if (RandInt(0, 1) == 0) + // Only store stats about every other minibatch (but on the first minibatch, + // always store it, which is necessary for the ConsolidateMemory() operation + // to work correctly. + if (RandInt(0, 1) == 0 && count_ != 0) return; CuMatrix temp_deriv(out_value.NumRows(), out_value.NumCols(), @@ -1637,6 +1643,12 @@ void NaturalGradientRepeatedAffineComponent::Update( bias_params_.AddVec(learning_rate_ * scale, bias_deriv); } +void NaturalGradientRepeatedAffineComponent::ConsolidateMemory() { + OnlineNaturalGradient temp(preconditioner_in_); + preconditioner_in_.Swap(&temp); +} + + BlockAffineComponent::BlockAffineComponent(const BlockAffineComponent &other) : UpdatableComponent(other), linear_params_(other.linear_params_), @@ -2555,6 +2567,13 @@ void ScaleAndOffsetComponent::BackpropInternal( } } +void ScaleAndOffsetComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_scale(scale_preconditioner_); + scale_preconditioner_.Swap(&temp_scale); + OnlineNaturalGradient temp_offset(offset_preconditioner_); + offset_preconditioner_.Swap(&temp_offset); +} + std::string ConstantFunctionComponent::Info() const { std::ostringstream stream; @@ -2744,7 +2763,10 @@ void ConstantFunctionComponent::UnVectorize(const VectorBase ¶ms) output_.CopyFromVec(params); } - +void ConstantFunctionComponent::ConsolidateMemory() { + OnlineNaturalGradient temp(preconditioner_); + preconditioner_.Swap(&temp); +} void NaturalGradientAffineComponent::Read(std::istream &is, bool binary) { ReadUpdatableCommon(is, binary); // Read the opening tag and learning rate @@ -3017,12 +3039,17 @@ void NaturalGradientAffineComponent::Add(BaseFloat alpha, const Component &other bias_params_.AddVec(alpha, other->bias_params_); } -/// virtual void NaturalGradientAffineComponent::FreezeNaturalGradient(bool freeze) { preconditioner_in_.Freeze(freeze); preconditioner_out_.Freeze(freeze); } +void NaturalGradientAffineComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_in(preconditioner_in_); + preconditioner_in_.Swap(&temp_in); + OnlineNaturalGradient temp_out(preconditioner_out_); + preconditioner_out_.Swap(&temp_out); +} void LinearComponent::Read(std::istream &is, bool binary) { std::string token = ReadUpdatableCommon(is, binary); @@ -3291,6 +3318,12 @@ void LinearComponent::FreezeNaturalGradient(bool freeze) { preconditioner_out_.Freeze(freeze); } +void LinearComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_in(preconditioner_in_); + preconditioner_in_.Swap(&temp_in); + OnlineNaturalGradient temp_out(preconditioner_out_); + preconditioner_out_.Swap(&temp_out); +} std::string FixedAffineComponent::Info() const { std::ostringstream stream; @@ -3900,11 +3933,15 @@ void NaturalGradientPerElementScaleComponent::Update( scales_.AddVec(1.0, delta_scales); } -/// virtual void NaturalGradientPerElementScaleComponent::FreezeNaturalGradient(bool freeze) { preconditioner_.Freeze(freeze); } +void NaturalGradientPerElementScaleComponent::ConsolidateMemory() { 
+ OnlineNaturalGradient temp(preconditioner_); + preconditioner_.Swap(&temp); +} + // Constructors for the convolution component ConvolutionComponent::ConvolutionComponent(): UpdatableComponent(), @@ -5874,6 +5911,11 @@ void LstmNonlinearityComponent::InitFromConfig(ConfigLine *cfl) { } } +void LstmNonlinearityComponent::ConsolidateMemory() { + OnlineNaturalGradient preconditioner_temp(preconditioner_); + preconditioner_.Swap(&preconditioner_temp); +} + SumBlockComponent::SumBlockComponent(const SumBlockComponent &other): input_dim_(other.input_dim_), output_dim_(other.output_dim_), scale_(other.scale_) { } diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index 12ae99d716b..11c60f8f352 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -650,6 +650,9 @@ class NaturalGradientRepeatedAffineComponent: public RepeatedAffineComponent { // Copy constructor explicit NaturalGradientRepeatedAffineComponent( const NaturalGradientRepeatedAffineComponent &other); + + virtual void ConsolidateMemory(); + private: virtual void Update( const CuMatrixBase &in_value, @@ -832,6 +835,9 @@ class NaturalGradientAffineComponent: public AffineComponent { virtual void Scale(BaseFloat scale); virtual void Add(BaseFloat alpha, const Component &other); virtual void FreezeNaturalGradient(bool freeze); + + virtual void ConsolidateMemory(); + // copy constructor explicit NaturalGradientAffineComponent( const NaturalGradientAffineComponent &other); @@ -955,6 +961,8 @@ class LinearComponent: public UpdatableComponent { virtual void Vectorize(VectorBase *params) const; virtual void UnVectorize(const VectorBase &params); virtual void FreezeNaturalGradient(bool freeze); + virtual void ConsolidateMemory(); + // copy constructor explicit LinearComponent(const LinearComponent &other); @@ -1715,6 +1723,7 @@ class ConstantFunctionComponent: public UpdatableComponent { virtual int32 NumParameters() const; virtual void Vectorize(VectorBase *params) const; virtual void UnVectorize(const VectorBase &params); + virtual void ConsolidateMemory(); private: int32 input_dim_; // the output value-- a vector.
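The ConsolidateMemory() overrides added above all rely on the same copy-and-swap idiom: copy-constructing a temporary forces the object's GPU-resident buffers to be re-allocated (which, once the temporary forward/backward buffers of the first minibatch have been freed, the caching allocator will place in low-address memory), and Swap() then hands that fresh storage to the member, so the old high-address storage is released when the temporary goes out of scope. A minimal sketch of the idiom follows; ExampleComponent is hypothetical, not part of the patch, and assumes the kaldi cudamatrix and nnet3 headers are available.

// Hypothetical component, for illustration only: shows the copy-and-swap
// re-allocation idiom used by the ConsolidateMemory() overrides in this patch.
class ExampleComponent {
 public:
  void ConsolidateMemory() {
    {
      // Copy-construct a temporary (this allocates fresh GPU memory), then
      // swap it into the member; the old storage is freed when 'temp' dies.
      OnlineNaturalGradient temp(preconditioner_);
      preconditioner_.Swap(&temp);
    }
    {
      CuVector<double> temp(value_sum_);
      value_sum_.Swap(&temp);
    }
  }
 private:
  OnlineNaturalGradient preconditioner_;  // natural-gradient state.
  CuVector<double> value_sum_;            // accumulated activation statistics.
};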
@@ -1783,6 +1792,8 @@ class NaturalGradientPerElementScaleComponent: public PerElementScaleComponent { int32 rank, int32 update_period, BaseFloat num_samples_history, BaseFloat alpha); + void ConsolidateMemory(); + private: // unlike the NaturalGradientAffineComponent, there is only one dimension to // consider as the parameters are a vector not a matrix, so we only need one @@ -1888,6 +1899,7 @@ class ScaleAndOffsetComponent: public UpdatableComponent { virtual int32 NumParameters() const { return 2 * scales_.Dim(); } virtual void Vectorize(VectorBase *params) const; virtual void UnVectorize(const VectorBase ¶ms); + virtual void ConsolidateMemory(); // copy constructor @@ -2281,6 +2293,8 @@ class LstmNonlinearityComponent: public UpdatableComponent { BaseFloat sigmoid_self_repair_threshold, BaseFloat self_repair_scale); + virtual void ConsolidateMemory(); + private: // Initializes the natural-gradient object with the configuration we diff --git a/src/nnet3/nnet-tdnn-component.cc b/src/nnet3/nnet-tdnn-component.cc index 52ad1031a4c..c287ce303a6 100644 --- a/src/nnet3/nnet-tdnn-component.cc +++ b/src/nnet3/nnet-tdnn-component.cc @@ -694,6 +694,12 @@ void TdnnComponent::PrecomputedIndexes::Read( ExpectToken(is, binary, ""); } +void TdnnComponent::ConsolidateMemory() { + OnlineNaturalGradient temp_in(preconditioner_in_); + preconditioner_in_.Swap(&temp_in); + OnlineNaturalGradient temp_out(preconditioner_out_); + preconditioner_out_.Swap(&temp_out); +} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-training.cc b/src/nnet3/nnet-training.cc index 8fda24cd22d..0acaa5c2008 100644 --- a/src/nnet3/nnet-training.cc +++ b/src/nnet3/nnet-training.cc @@ -82,8 +82,12 @@ void NnetTrainer::Train(const NnetExample &eg) { } else { // conventional training TrainInternal(eg, *computation); } - + if (num_minibatches_processed_ == 0) { + ConsolidateMemory(nnet_); + ConsolidateMemory(delta_nnet_); + } num_minibatches_processed_++; + } void NnetTrainer::TrainInternal(const NnetExample &eg, diff --git a/src/nnet3/nnet-utils.cc b/src/nnet3/nnet-utils.cc index d16a728e2ab..e020f8fc6a7 100644 --- a/src/nnet3/nnet-utils.cc +++ b/src/nnet3/nnet-utils.cc @@ -1058,6 +1058,27 @@ void ConstrainOrthonormal(Nnet *nnet) { } } +void ConsolidateMemory(Nnet *nnet) { +#if HAVE_CUDA == 1 + if (CuDevice::Instantiate().Enabled()) { + bool print_memory_info = (GetVerboseLevel() >= 1); + if (print_memory_info) { + KALDI_VLOG(1) << "Consolidating memory; will print memory usage before " + "and after consolidating:"; + g_cuda_allocator.PrintMemoryUsage(); + } + for (int32 c = 0; c < nnet->NumComponents(); c++) { + Component *comp = nnet->GetComponent(c); + comp->ConsolidateMemory(); + } + if (print_memory_info) { + g_cuda_allocator.PrintMemoryUsage(); + } + } +#endif +} + + // This code has been broken out of ReadEditConfig as it's quite long. // It implements the internals of the edit directive 'reduce-rank'. @@ -2065,7 +2086,7 @@ bool UpdateNnetWithMaxChange(const Nnet &delta_nnet, ostr << "Per-component max-change active on " << num_max_change_per_component_applied_per_minibatch << " / " << num_updatable << " Updatable Components." - << "(smallest factor=" << min_scale << " on " + << " (Smallest factor=" << min_scale << " on " << component_name_with_min_scale << " with max-change=" << max_change_with_min_scale <<"). 
"; if (param_delta > max_param_change * max_change_scale) diff --git a/src/nnet3/nnet-utils.h b/src/nnet3/nnet-utils.h index c54fcf87e64..787bd228a38 100644 --- a/src/nnet3/nnet-utils.h +++ b/src/nnet3/nnet-utils.h @@ -476,6 +476,19 @@ void ScaleBatchnormStats(BaseFloat batchnorm_stats_scale, */ void ConstrainOrthonormal(Nnet *nnet); + +/** + This just calls ConsolidateMemory() on all the components of the nnet. This + is called by the training code after processing the first minibatch. On some + components this will do nothing; on some components it will reallocate + certain quantities that have been allocated during training (mostly the + contents of NaturalGradientOnline objects, and stats for NonlinearComponents) + so that they can be put into low memory. This will tend to minimize + memory fragmentation. Read comments in ../cudamatrix/cu-allocator.h for + more explanation. + */ +void ConsolidateMemory(Nnet *nnet); + /** This utility function can be used to obtain the number of distinct 'n' values in a training example. This is the number of examples (e.g. sequences) that have been combined into a single example. (Actually diff --git a/src/nnet3bin/nnet3-train.cc b/src/nnet3bin/nnet3-train.cc index 271af5d06dc..d3fbaa587e1 100644 --- a/src/nnet3bin/nnet3-train.cc +++ b/src/nnet3bin/nnet3-train.cc @@ -20,7 +20,7 @@ #include "base/kaldi-common.h" #include "util/common-utils.h" #include "nnet3/nnet-training.h" - +#include "cudamatrix/cu-allocator.h" int main(int argc, char *argv[]) { try { @@ -53,6 +53,7 @@ int main(int argc, char *argv[]) { "yes|no|optional|wait, only has effect if compiled with CUDA"); train_config.Register(&po); + RegisterCuAllocatorOptions(&po); po.Read(argc, argv); @@ -94,5 +95,3 @@ int main(int argc, char *argv[]) { return -1; } } - - diff --git a/src/rnnlm/rnnlm-core-training.cc b/src/rnnlm/rnnlm-core-training.cc index 63a6dee188d..5a1ae97895f 100644 --- a/src/rnnlm/rnnlm-core-training.cc +++ b/src/rnnlm/rnnlm-core-training.cc @@ -343,6 +343,11 @@ void RnnlmCoreTrainer::ProcessOutput( computer->AcceptInput("output", &output_deriv); } +void RnnlmCoreTrainer::ConsolidateMemory() { + kaldi::nnet3::ConsolidateMemory(nnet_); + kaldi::nnet3::ConsolidateMemory(delta_nnet_); +} + RnnlmCoreTrainer::~RnnlmCoreTrainer() { PrintMaxChangeStats(); // Note: the objective-function stats are printed out in the destructor of the diff --git a/src/rnnlm/rnnlm-core-training.h b/src/rnnlm/rnnlm-core-training.h index 8f5ce873ff1..dd5fcfebd95 100644 --- a/src/rnnlm/rnnlm-core-training.h +++ b/src/rnnlm/rnnlm-core-training.h @@ -189,6 +189,10 @@ class RnnlmCoreTrainer { // per-component max-change and global max-change were enforced. void PrintMaxChangeStats() const; + + // Calls ConsolidateMemory() on nnet_ and delta_nnet_. 
+ void ConsolidateMemory(); + ~RnnlmCoreTrainer(); private: diff --git a/src/rnnlm/rnnlm-embedding-training.cc b/src/rnnlm/rnnlm-embedding-training.cc index 4c42bd4ab39..c4238c7356a 100644 --- a/src/rnnlm/rnnlm-embedding-training.cc +++ b/src/rnnlm/rnnlm-embedding-training.cc @@ -175,7 +175,7 @@ void RnnlmEmbeddingTrainer::Train( if (config_.l2_regularize > 0.0) { BaseFloat l2_term = -2 * config_.l2_regularize; if (l2_term != 0.0) { - embedding_deriv->AddToRows(l2_term, active_words, embedding_mat_); + embedding_deriv->AddRows(l2_term, *embedding_mat_, active_words); } } BaseFloat scale = 1.0; @@ -229,8 +229,8 @@ void RnnlmEmbeddingTrainer::TrainBackstitch( if (config_.l2_regularize > 0.0 && !is_backstitch_step1) { BaseFloat l2_term = -2 * config_.l2_regularize; if (l2_term != 0.0) { - embedding_deriv->AddMat(1.0 / (1.0 + config_.backstitch_training_scale) * - l2_term, *embedding_mat_); + embedding_deriv->AddRows(l2_term / (1.0 + config_.backstitch_training_scale), + *embedding_mat_, active_words); } } BaseFloat scale = 1.0; diff --git a/src/rnnlm/rnnlm-example-test.cc b/src/rnnlm/rnnlm-example-test.cc index 8b393acf4ff..ccfdd90bbea 100644 --- a/src/rnnlm/rnnlm-example-test.cc +++ b/src/rnnlm/rnnlm-example-test.cc @@ -305,6 +305,8 @@ int main() { SetVerboseLevel(4); CuDevice::Instantiate().PrintProfile(); #endif + + unlink("tmp.ark"); return 0; } diff --git a/src/rnnlm/rnnlm-training.cc b/src/rnnlm/rnnlm-training.cc index 370f6395dc0..6db4d6f05b4 100644 --- a/src/rnnlm/rnnlm-training.cc +++ b/src/rnnlm/rnnlm-training.cc @@ -110,6 +110,9 @@ void RnnlmTrainer::Train(RnnlmExample *minibatch) { active_word_features_trans_.Swap(&active_word_features_trans); TrainInternal(); + + if (num_minibatches_processed_ == 1) + core_trainer_->ConsolidateMemory(); } diff --git a/src/rnnlmbin/rnnlm-train.cc b/src/rnnlmbin/rnnlm-train.cc index 6a212dd4aad..d9107e310f5 100644 --- a/src/rnnlmbin/rnnlm-train.cc +++ b/src/rnnlmbin/rnnlm-train.cc @@ -22,7 +22,7 @@ #include "rnnlm/rnnlm-training.h" #include "rnnlm/rnnlm-example-utils.h" #include "nnet3/nnet-utils.h" - +#include "cudamatrix/cu-allocator.h" int main(int argc, char *argv[]) { try { @@ -93,6 +93,7 @@ int main(int argc, char *argv[]) { objective_config.Register(&po); + RegisterCuAllocatorOptions(&po); // register the core RNNLM training options options with the prefix "rnnlm", // so they will appear as --rnnlm.max-change and the like. This is done diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc index 4eab67f52be..5583717633c 100644 --- a/src/tree/context-dep.cc +++ b/src/tree/context-dep.cc @@ -319,8 +319,8 @@ void ContextDependency::GetPdfInfo( ContextDependency* -MonophoneContextDependency(const std::vector phones, - const std::vector phone2num_pdf_classes) { +MonophoneContextDependency(const std::vector &phones, + const std::vector &phone2num_pdf_classes) { std::vector > phone_sets(phones.size()); for (size_t i = 0; i < phones.size(); i++) phone_sets[i].push_back(phones[i]); std::vector share_roots(phones.size(), false); // don't share roots. @@ -331,8 +331,8 @@ MonophoneContextDependency(const std::vector phones, } ContextDependency* -MonophoneContextDependencyShared(const std::vector > phone_sets, - const std::vector phone2num_pdf_classes) { +MonophoneContextDependencyShared(const std::vector > &phone_sets, + const std::vector &phone2num_pdf_classes) { std::vector share_roots(phone_sets.size(), false); // don't share roots. // N is context size, P = position of central phone (must be 0). 
diff --git a/src/rnnlm/rnnlm-example-test.cc b/src/rnnlm/rnnlm-example-test.cc
index 8b393acf4ff..ccfdd90bbea 100644
--- a/src/rnnlm/rnnlm-example-test.cc
+++ b/src/rnnlm/rnnlm-example-test.cc
@@ -305,6 +305,8 @@ int main() {
   SetVerboseLevel(4);
   CuDevice::Instantiate().PrintProfile();
 #endif
+
+  unlink("tmp.ark");
   return 0;
 }
diff --git a/src/rnnlm/rnnlm-training.cc b/src/rnnlm/rnnlm-training.cc
index 370f6395dc0..6db4d6f05b4 100644
--- a/src/rnnlm/rnnlm-training.cc
+++ b/src/rnnlm/rnnlm-training.cc
@@ -110,6 +110,9 @@ void RnnlmTrainer::Train(RnnlmExample *minibatch) {
   active_word_features_trans_.Swap(&active_word_features_trans);

   TrainInternal();
+
+  if (num_minibatches_processed_ == 1)
+    core_trainer_->ConsolidateMemory();
 }

diff --git a/src/rnnlmbin/rnnlm-train.cc b/src/rnnlmbin/rnnlm-train.cc
index 6a212dd4aad..d9107e310f5 100644
--- a/src/rnnlmbin/rnnlm-train.cc
+++ b/src/rnnlmbin/rnnlm-train.cc
@@ -22,7 +22,7 @@
 #include "rnnlm/rnnlm-training.h"
 #include "rnnlm/rnnlm-example-utils.h"
 #include "nnet3/nnet-utils.h"
-
+#include "cudamatrix/cu-allocator.h"

 int main(int argc, char *argv[]) {
   try {
@@ -93,6 +93,7 @@ int main(int argc, char *argv[]) {

     objective_config.Register(&po);
+    RegisterCuAllocatorOptions(&po);

     // register the core RNNLM training options options with the prefix "rnnlm",
     // so they will appear as --rnnlm.max-change and the like.  This is done
diff --git a/src/tree/context-dep.cc b/src/tree/context-dep.cc
index 4eab67f52be..5583717633c 100644
--- a/src/tree/context-dep.cc
+++ b/src/tree/context-dep.cc
@@ -319,8 +319,8 @@ void ContextDependency::GetPdfInfo(


 ContextDependency*
-MonophoneContextDependency(const std::vector<int32> phones,
-                           const std::vector<int32> phone2num_pdf_classes) {
+MonophoneContextDependency(const std::vector<int32> &phones,
+                           const std::vector<int32> &phone2num_pdf_classes) {
   std::vector<std::vector<int32> > phone_sets(phones.size());
   for (size_t i = 0; i < phones.size(); i++) phone_sets[i].push_back(phones[i]);
   std::vector<bool> share_roots(phones.size(), false);  // don't share roots.
@@ -331,8 +331,8 @@ MonophoneContextDependency(const std::vector<int32> phones,
 }

 ContextDependency*
-MonophoneContextDependencyShared(const std::vector<std::vector<int32> > phone_sets,
-                                 const std::vector<int32> phone2num_pdf_classes) {
+MonophoneContextDependencyShared(const std::vector<std::vector<int32> > &phone_sets,
+                                 const std::vector<int32> &phone2num_pdf_classes) {
   std::vector<bool> share_roots(phone_sets.size(), false);  // don't share roots.
   // N is context size, P = position of central phone (must be 0).
   int32 num_leaves = 0, P = 0, N = 1;
diff --git a/src/tree/context-dep.h b/src/tree/context-dep.h
index 6342d89667b..e69c26f8638 100644
--- a/src/tree/context-dep.h
+++ b/src/tree/context-dep.h
@@ -180,15 +180,15 @@ ContextDependency *GenRandContextDependencyLarge(const std::vector<int32> &phone
 // 0, 1, 2).

 ContextDependency*
-MonophoneContextDependency(const std::vector<int32> phones,
-                           const std::vector<int32> phone2num_pdf_classes);
+MonophoneContextDependency(const std::vector<int32> &phones,
+                           const std::vector<int32> &phone2num_pdf_classes);

 // MonophoneContextDependencyShared is as MonophoneContextDependency but lets
 // you define classes of phones which share pdfs (e.g. different stress-markers of a single
 // phone.)  Each element of phone_classes is a set of phones that are in that class.
 ContextDependency*
-MonophoneContextDependencyShared(const std::vector<std::vector<int32> > phone_classes,
-                                 const std::vector<int32> phone2num_pdf_classes);
+MonophoneContextDependencyShared(const std::vector<std::vector<int32> > &phone_classes,
+                                 const std::vector<int32> &phone2num_pdf_classes);


 // Important note:
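For reference, a hypothetical usage sketch of the monophone-tree builder whose signature changes above; the helper name and the phone counts are made up for illustration, and the point is simply that the two vectors are now taken by const reference, so calls like this no longer copy them.

    #include <vector>
    #include "base/kaldi-common.h"
    #include "tree/context-dep.h"

    // Illustrative only: build a monophone tree for phones 1..10, each with
    // 3 pdf-classes (HMM states).  The caller owns the returned pointer.
    kaldi::ContextDependency *BuildMonophoneTreeSketch() {
      using namespace kaldi;
      std::vector<int32> phones;
      for (int32 p = 1; p <= 10; p++)
        phones.push_back(p);
      // Indexed by phone-id; entry 0 is unused.
      std::vector<int32> phone2num_pdf_classes(11, 3);
      // With the const-reference signatures, no copies of the vectors are made.
      return MonophoneContextDependency(phones, phone2num_pdf_classes);
    }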
diff --git a/src/util/kaldi-thread-test.cc b/src/util/kaldi-thread-test.cc
index e1776859222..eb6b72d1ed4 100644
--- a/src/util/kaldi-thread-test.cc
+++ b/src/util/kaldi-thread-test.cc
@@ -128,6 +128,6 @@ void TestTaskSequencer() {
 int main() {
   using namespace kaldi;
   TestThreads();
-  for (int32 i = 0; i < 1000; i++)
+  for (int32 i = 0; i < 10; i++)
     TestTaskSequencer();
 }
diff --git a/tools/extras/check_dependencies.sh b/tools/extras/check_dependencies.sh
index cd9ec7f5c1e..dc3ad8fbe57 100755
--- a/tools/extras/check_dependencies.sh
+++ b/tools/extras/check_dependencies.sh
@@ -24,15 +24,15 @@ COMPILER_VER_INFO=$($CXX --version 2>/dev/null)
 case $COMPILER_VER_INFO in
   "")
     echo "$0: $CXX is not installed."
-    echo "$0: You need g++ >= 4.7, Apple Xcode >= 5.0 or clang >= 3.3."
+    echo "$0: You need g++ >= 4.8.3, Apple Xcode >= 5.0 or clang >= 3.3."
     status=1
     ;;
   "g++ "* )
     GCC_VER=$($CXX -dumpversion)
     GCC_VER_NUM=$(echo $GCC_VER | sed 's/\./ /g' | xargs printf "%d%02d%02d")
-    if [ $GCC_VER_NUM -lt 40700 ]; then
+    if [ $GCC_VER_NUM -lt 40803 ]; then
       echo "$0: $CXX (g++-$GCC_VER) is not supported."
-      echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3."
+      echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
       status=1
     fi
     ;;
@@ -42,7 +42,7 @@ case $COMPILER_VER_INFO in
     CLANG_VER_NUM=$(echo $COMPILER_VER_INFO | grep version | sed "s/.*clang-\([0-9]*\).*/\1/")
     if [ $CLANG_VER_NUM -lt 500 ]; then
       echo "$0: $CXX (Apple clang-$CLANG_VER) is not supported."
-      echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3."
+      echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
       status=1
     fi
     ;;
@@ -51,7 +51,7 @@ case $COMPILER_VER_INFO in
     CLANG_VER_NUM=$(echo $CLANG_VER | sed 's/\./ /g' | xargs printf "%d%02d")
     if [ $CLANG_VER_NUM -lt 303 ]; then
       echo "$0: $CXX (LLVM clang-$CLANG_VER) is not supported."
-      echo "$0: You need g++ >= 4.7, Apple clang >= 5.0 or LLVM clang >= 3.3."
+      echo "$0: You need g++ >= 4.8.3, Apple clang >= 5.0 or LLVM clang >= 3.3."
       status=1
     fi
     ;;
diff --git a/windows/INSTALL.md b/windows/INSTALL.md
index cd9c77b1776..c48f2908e98 100644
--- a/windows/INSTALL.md
+++ b/windows/INSTALL.md
@@ -6,10 +6,10 @@ For cygwin installation, see the instructions in `../INSTALL`.
 ## Notes

  * The recipes (in egs/) will not work. There is no commitment to support Windows.
-   The Windows port of Kaldi is targeted at experienced developers who want
-   to program their own apps using the kaldi libraries and are able to do
-   the troubleshooting on their own.
-* These instructions are valid November 2017,
+   The Windows port of Kaldi is targeted at experienced developers who want
+   to program their own apps using the kaldi libraries and are able to do
+   the troubleshooting on their own.
+* These instructions are valid November 2017,
   [Intel® MKL](https://software.intel.com/en-us/intel-mkl) and OpenBLAS are supported
 * ATLAS is not supported and I personally have no intention to work on
   supporting it, as it requires whole cygwin environment
@@ -19,7 +19,7 @@ For cygwin installation, see the instructions in `../INSTALL`.
   and we didn't test if the solutions work or not.
 * While the 32bit project files will still be generated, we don't really care
   if they work or not. They will be removed in the near future.
-* The build process was validated using MSVC2017. We do not support earlier
+* The build process was validated using MSVC2017. We do not support earlier
   releases (i.e. MSVC2015 and older). The reason is the C++11 support is still
   very buggy in the MS compiler.
 * We support only openfst-1.6.5 for now.
@@ -36,7 +36,7 @@ For cygwin installation, see the instructions in `../INSTALL`.
 ## Compiling OpenFST
 Skip this section, if you have downloaded OpenFST project from https://github.com/kkm000/openfst.git
 and it already contains openfst.sln file in the root folder. If it is present you can directly open it with Visual Studio 17 and you do not need CMake.
--------------------------
+-------------------------
 For compilation of OpenFST, you will need CMake installed. Simply go to https://cmake.org/download/
 and download and install.
 Then, in the command line, run the following commands. Be very careful about writing the commands verbatim!
@@ -45,7 +45,7 @@ Then, in the command line, run the following commands. Be very careful about wri
     $ mkdir build64
     $ cd build64
     $ cmake -G "Visual Studio 15 2017 Win64" ../
-
+
 The last command will generate output looking similarly to this. Do not try to read too much into specific versions of the programs.

 -- The C compiler identification is MSVC 19.11.25547.0
@@ -73,20 +73,20 @@ The last command will generate output looking similarly to this. Do not try to r
 -- Generating done
 -- Build files have been written to: C:/Users/jtrmal/Documents/openfst/build64

-In the directory `build64`, find the file `openfst.sln` and open it using Visual Studio 17.
--------------------------
+In the directory `build64`, find the file `openfst.sln` and open it using Visual Studio 17.
+-------------------------

- **Switch the configuration to `debug|Win64` and build the solution.**
- **Do the same for configuration `release|Win64`.**
+ **Switch the configuration to `Debug|x64` and build the solution.**
+ **Do the same for configuration `Release|x64`.**

 If either of the two won't build, you should stop here and start figuring what's different!

-## Compiling Kaldi
-
+## Compiling Kaldi
+
 1. Checkout Kaldi trunk, using [git](https://git-for-windows.github.io/) from
    https://github.com/kaldi-asr/kaldi.git
    Example:
-
+
       $ git clone https://github.com/kaldi-asr/kaldi.git kaldi

 There are two options to use for BLAS (linear algebra): [Intel® MKL](https://software.intel.com/en-us/intel-mkl) and OpenBLAS.
 [Intel® MKL](https://software.intel.com/en-us/intel-mkl) is made by Intel and is optimised
@@ -124,7 +124,7 @@ for their processors. It isn't free, but you can get [Community Licensing for In

 4. Enter the `(kaldi)/windows` directory
    Example:
-
+
       (kaldi)/$ cd windows

       (kaldi)/windows $ pwd
@@ -148,7 +148,7 @@ for their processors. It isn't free, but you can get [Community Licensing for In
        generate_solution.pl --vsver [--enable-cuda] [--enable-openblas] [--enable-mkl]

     `--enable-mkl` is the default so you shouldn't need to use it. If `--enable-openblas` is passed it disables MKL support.
-    CUDA is disabled by default. The default Visual Studio version is 15.0 (Visual Studio 2017).
+    CUDA is disabled by default. The default Visual Studio version is 15.0 (Visual Studio 2017).
     Please note that while we support generating the project for Visual Studio 2015,
     the C++11 support for that compiler is rather sub-par, i.e. it won't probably compile.
     When choosing Visual Studio 2015, you are on your own!
@@ -161,10 +161,10 @@ for their processors. It isn't free, but you can get [Community Licensing for In

       (kaldi)/windows$ generate_solution.pl --vsver vs2017 --enable-cuda --enable-openblas

 9. Run the script (kaldi)/windows/get_version.pl:
-
+
       (kaldi)/windows$ get_version.pl
-
-
-10. Open the generated solution that was created in a subfolder (kaldi)/kaldiwin_vs_
+
+
+10. Open the generated solution that was created in a subfolder (kaldi)/kaldiwin_vs_
     in the visual studio and switch to **Debug|x64** (or **Release|x64**) and build.
     Expect 10 projects to fail, majority of them will fail because of missing include `portaudio.h`.
     The tests will fail to compile too -- this is because of deficiency of the script generate_solution.pl. We might fix it
diff --git a/windows/variables.props.dev b/windows/variables.props.dev
index d797f2f2abf..9fb2457c99c 100644
--- a/windows/variables.props.dev
+++ b/windows/variables.props.dev
@@ -7,7 +7,7 @@
     C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows\mkl\
     C:\Users\Yenda\Downloads\kaldi-svn\tools\OpenBLAS-v0.2.14-Win64-int32
     C:\Users\jtrmal\Documents\openfst\
-    C:\Users\jtrmal\Documents\openfst\build64\lib
+    C:\Users\jtrmal\Documents\openfst\build64