diff --git a/egs/gale_arabic/s5/local/split_wer_per_corpus.sh b/egs/gale_arabic/s5/local/split_wer_per_corpus.sh new file mode 100755 index 00000000000..71c8adcc3fe --- /dev/null +++ b/egs/gale_arabic/s5/local/split_wer_per_corpus.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . ./path.sh + +#set -o pipefail -e + +galeFolder=$(readlink -f $1) +symtab=./data/lang/words.txt + +min_lmwt=7 +max_lmwt=20 + +for dir in exp/*/*decode*; do + for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + #echo "Processing: $dir $type" + rm -fr $dir/scoring_$type + mkdir -p $dir/scoring_$type/log + for x in $dir/scoring/*.tra $dir/scoring/test_filt.txt; do + cat $x | grep -f local/$type > $dir/scoring_$type/$(basename $x) + done + + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT +done +done + +time=$(date +"%Y-%m-%d-%H-%M-%S") +echo "#RESULTS splits generated by $USER at $time" + +for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + echo -e "\n# WER $type" + for x in exp/*/*decode*; do + grep WER $x/wer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + + + + diff --git a/egs/gale_mandarin/s5/RESULTS b/egs/gale_mandarin/s5/RESULTS new file mode 100644 index 00000000000..b1edc568a0c --- /dev/null +++ b/egs/gale_mandarin/s5/RESULTS @@ -0,0 +1,266 @@ +# Get WER and CER +%WER 48.37 [ 263343 / 544398, 59464 ins, 61751 del, 142128 sub ] exp/tri1/decode/wer_14 +%WER 46.92 [ 255435 / 544398, 61084 ins, 56012 del, 138339 sub ] exp/tri2a/decode/wer_14 +%WER 46.45 [ 252879 / 544398, 
64019 ins, 49291 del, 139569 sub ] exp/tri3b/decode.si/wer_14 +%WER 45.20 [ 246062 / 544398, 64493 ins, 46744 del, 134825 sub ] exp/tri2b/decode/wer_14 +%WER 41.30 [ 224836 / 544398, 62047 ins, 46613 del, 116176 sub ] exp/tri2b_mpe/decode_it3/wer_12 +%WER 41.06 [ 223547 / 544398, 67146 ins, 37475 del, 118926 sub ] exp/tri3b/decode/wer_14 +%WER 40.66 [ 221333 / 544398, 57785 ins, 45636 del, 117912 sub ] exp/tri2b_mmi/decode_it3/wer_11 +%WER 40.58 [ 220918 / 544398, 58174 ins, 52314 del, 110430 sub ] exp/tri2b_mpe/decode_it4/wer_13 +%WER 40.42 [ 220024 / 544398, 49748 ins, 58009 del, 112267 sub ] exp/tri2b_mmi/decode_it4/wer_11 +%WER 40.22 [ 218975 / 544398, 55657 ins, 50365 del, 112953 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_12 +%WER 39.77 [ 216506 / 544398, 65278 ins, 36165 del, 115063 sub ] exp/sgmm_5a/decode/wer_11 +%WER 39.69 [ 216051 / 544398, 53034 ins, 53491 del, 109526 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_11 +%WER 38.67 [ 210531 / 544398, 66344 ins, 31914 del, 112273 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_10 +%WER 38.18 [ 207867 / 544398, 65994 ins, 31883 del, 109990 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_10 +%WER 37.78 [ 205693 / 544398, 65686 ins, 31705 del, 108302 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_10 +%WER 37.51 [ 204229 / 544398, 65502 ins, 31771 del, 106956 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_10 +%WER 36.94 [ 201074 / 544398, 66470 ins, 30258 del, 104346 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_9 + +%WER 40.70 [ 360556 / 885790, 89973 ins, 82245 del, 188338 sub ] exp/tri1/decode/cer_13 +%WER 39.29 [ 348071 / 885790, 92415 ins, 74065 del, 181591 sub ] exp/tri2a/decode/cer_13 +%WER 38.68 [ 342642 / 885790, 98429 ins, 62154 del, 182059 sub ] exp/tri3b/decode.si/cer_12 +%WER 37.49 [ 332045 / 885790, 96932 ins, 61925 del, 173188 sub ] exp/tri2b/decode/cer_13 +%WER 33.85 [ 299862 / 885790, 93460 ins, 60231 del, 146171 sub ] exp/tri2b_mpe/decode_it3/cer_11 +%WER 33.49 [ 296629 / 885790, 86746 ins, 61534 del, 148349 sub ] exp/tri2b_mmi/decode_it3/cer_11 
+%WER 33.37 [ 295570 / 885790, 80320 ins, 70288 del, 144962 sub ] exp/tri2b_mmi/decode_it4/cer_10 +%WER 33.30 [ 295009 / 885790, 99171 ins, 50231 del, 145607 sub ] exp/tri3b/decode/cer_13 +%WER 33.23 [ 294379 / 885790, 88389 ins, 68681 del, 137309 sub ] exp/tri2b_mpe/decode_it4/cer_12 +%WER 33.03 [ 292595 / 885790, 87700 ins, 61287 del, 143608 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_11 +%WER 32.60 [ 288751 / 885790, 83604 ins, 65659 del, 139488 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_10 +%WER 32.14 [ 284728 / 885790, 99089 ins, 45433 del, 140206 sub ] exp/sgmm_5a/decode/cer_10 +%WER 31.24 [ 276708 / 885790, 101134 ins, 39271 del, 136303 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_9 +%WER 30.82 [ 273013 / 885790, 100939 ins, 38720 del, 133354 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_9 +%WER 30.49 [ 270059 / 885790, 100834 ins, 38371 del, 130854 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_9 +%WER 30.25 [ 267980 / 885790, 100694 ins, 38242 del, 129044 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_9 +%WER 29.76 [ 263594 / 885790, 99415 ins, 39444 del, 124735 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_9 + +# Detailed WER on all corpus dev sets + +# WER test.LDC2013S04 +%WER 37.60 [ 35770 / 95137, 5670 ins, 7459 del, 22641 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2013S04_9 +%WER 38.20 [ 36338 / 95137, 5759 ins, 7315 del, 23264 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2013S04_9 +%WER 38.51 [ 36639 / 95137, 5768 ins, 7390 del, 23481 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2013S04_9 +%WER 39.07 [ 37173 / 95137, 5805 ins, 7425 del, 23943 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2013S04_9 +%WER 39.64 [ 37713 / 95137, 5843 ins, 7490 del, 24380 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2013S04_9 +%WER 40.88 [ 38894 / 95137, 5514 ins, 8378 del, 25002 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S04_10 +%WER 41.71 [ 39680 / 95137, 5237 ins, 9772 del, 24671 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2013S04_10 +%WER 41.96 [ 39915 / 95137, 5626 ins, 8584 del, 25705 sub ] 
exp/tri3b/decode/wer_test.LDC2013S04_13 +%WER 42.02 [ 39973 / 95137, 5539 ins, 8861 del, 25573 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2013S04_10 +%WER 42.13 [ 40081 / 95137, 5170 ins, 9891 del, 25020 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2013S04_11 +%WER 42.71 [ 40635 / 95137, 5332 ins, 9748 del, 25555 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2013S04_11 +%WER 42.72 [ 40643 / 95137, 5624 ins, 8835 del, 26184 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2013S04_10 +%WER 42.97 [ 40880 / 95137, 5278 ins, 10109 del, 25493 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2013S04_10 +%WER 47.10 [ 44807 / 95137, 5574 ins, 10120 del, 29113 sub ] exp/tri2b/decode/wer_test.LDC2013S04_13 +%WER 48.93 [ 46555 / 95137, 5680 ins, 10447 del, 30428 sub ] exp/tri3b/decode.si/wer_test.LDC2013S04_12 +%WER 49.38 [ 46982 / 95137, 4996 ins, 11786 del, 30200 sub ] exp/tri2a/decode/wer_test.LDC2013S04_14 +%WER 50.97 [ 48494 / 95137, 5175 ins, 11987 del, 31332 sub ] exp/tri1/decode/wer_test.LDC2013S04_13 + +# WER test.LDC2013S08 +%WER 22.16 [ 17707 / 79911, 3606 ins, 2589 del, 11512 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2013S08_8 +%WER 22.54 [ 18009 / 79911, 3486 ins, 2764 del, 11759 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2013S08_9 +%WER 22.84 [ 18253 / 79911, 3626 ins, 2612 del, 12015 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2013S08_8 +%WER 23.16 [ 18507 / 79911, 3506 ins, 2819 del, 12182 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2013S08_9 +%WER 23.62 [ 18877 / 79911, 3554 ins, 2849 del, 12474 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2013S08_9 +%WER 24.52 [ 19594 / 79911, 3688 ins, 3017 del, 12889 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S08_9 +%WER 25.25 [ 20177 / 79911, 3357 ins, 3442 del, 13378 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2013S08_10 +%WER 25.53 [ 20400 / 79911, 3346 ins, 3483 del, 13571 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2013S08_11 +%WER 25.59 [ 20447 / 79911, 3330 ins, 3814 del, 13303 sub ] 
exp/tri2b_mpe/decode_it4/wer_test.LDC2013S08_12 +%WER 25.67 [ 20510 / 79911, 3660 ins, 3361 del, 13489 sub ] exp/tri3b/decode/wer_test.LDC2013S08_13 +%WER 25.91 [ 20702 / 79911, 3295 ins, 3670 del, 13737 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2013S08_10 +%WER 25.93 [ 20721 / 79911, 3319 ins, 3532 del, 13870 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2013S08_11 +%WER 26.08 [ 20841 / 79911, 3418 ins, 3757 del, 13666 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2013S08_12 +%WER 29.35 [ 23450 / 79911, 3706 ins, 3910 del, 15834 sub ] exp/tri2b/decode/wer_test.LDC2013S08_13 +%WER 30.48 [ 24359 / 79911, 3831 ins, 3859 del, 16669 sub ] exp/tri3b/decode.si/wer_test.LDC2013S08_11 +%WER 31.68 [ 25314 / 79911, 3637 ins, 4636 del, 17041 sub ] exp/tri2a/decode/wer_test.LDC2013S08_14 +%WER 33.01 [ 26375 / 79911, 3675 ins, 4743 del, 17957 sub ] exp/tri1/decode/wer_test.LDC2013S08_13 + +# WER test.LDC2014S09 +%WER 45.41 [ 35390 / 77932, 11018 ins, 5860 del, 18512 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2014S09_10 +%WER 46.00 [ 35848 / 77932, 10930 ins, 6053 del, 18865 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2014S09_11 +%WER 46.27 [ 36059 / 77932, 10673 ins, 6370 del, 19016 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2014S09_12 +%WER 46.57 [ 36293 / 77932, 11029 ins, 5994 del, 19270 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2014S09_11 +%WER 47.07 [ 36684 / 77932, 10819 ins, 6276 del, 19589 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2014S09_12 +%WER 47.80 [ 37249 / 77932, 7270 ins, 12090 del, 17889 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2014S09_11 +%WER 48.15 [ 37528 / 77932, 10807 ins, 6823 del, 19898 sub ] exp/sgmm_5a/decode/wer_test.LDC2014S09_12 +%WER 48.40 [ 37722 / 77932, 6651 ins, 13935 del, 17136 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2014S09_14 +%WER 48.52 [ 37812 / 77932, 6507 ins, 13163 del, 18142 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2014S09_11 +%WER 48.69 [ 37947 / 77932, 6897 ins, 12985 del, 18065 sub ] 
exp/tri2b_mmi/decode_it3/wer_test.LDC2014S09_13 +%WER 48.75 [ 37995 / 77932, 8250 ins, 12319 del, 17426 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2014S09_14 +%WER 49.49 [ 38569 / 77932, 10792 ins, 7406 del, 20371 sub ] exp/tri3b/decode/wer_test.LDC2014S09_16 +%WER 49.55 [ 38615 / 77932, 8623 ins, 11835 del, 18157 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2014S09_14 +%WER 53.82 [ 41942 / 77932, 9645 ins, 10274 del, 22023 sub ] exp/tri2b/decode/wer_test.LDC2014S09_16 +%WER 54.92 [ 42801 / 77932, 8585 ins, 13190 del, 21026 sub ] exp/tri2a/decode/wer_test.LDC2014S09_16 +%WER 55.33 [ 43118 / 77932, 10169 ins, 9959 del, 22990 sub ] exp/tri3b/decode.si/wer_test.LDC2014S09_15 +%WER 56.01 [ 43648 / 77932, 7925 ins, 14882 del, 20841 sub ] exp/tri1/decode/wer_test.LDC2014S09_16 + +# WER test.LDC2015S06 +%WER 41.65 [ 25659 / 61612, 8345 ins, 4519 del, 12795 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2015S06_9 +%WER 42.31 [ 26067 / 61612, 8572 ins, 4202 del, 13293 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2015S06_8 +%WER 42.64 [ 26271 / 61612, 8588 ins, 4225 del, 13458 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2015S06_8 +%WER 43.00 [ 26491 / 61612, 8453 ins, 4486 del, 13552 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2015S06_9 +%WER 43.57 [ 26846 / 61612, 8485 ins, 4545 del, 13816 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2015S06_9 +%WER 44.64 [ 27503 / 61612, 8428 ins, 4884 del, 14191 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S06_10 +%WER 45.50 [ 28034 / 61612, 7886 ins, 6546 del, 13602 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2015S06_9 +%WER 45.71 [ 28165 / 61612, 7943 ins, 6204 del, 14018 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2015S06_10 +%WER 46.14 [ 28428 / 61612, 8541 ins, 5351 del, 14536 sub ] exp/tri3b/decode/wer_test.LDC2015S06_13 +%WER 46.15 [ 28434 / 61612, 8006 ins, 6925 del, 13503 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2015S06_11 +%WER 46.19 [ 28459 / 61612, 8143 ins, 5704 del, 14612 sub ] 
exp/tri2b_mmi/decode_it3/wer_test.LDC2015S06_9 +%WER 46.35 [ 28555 / 61612, 7379 ins, 7453 del, 13723 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2015S06_10 +%WER 46.66 [ 28751 / 61612, 8068 ins, 6749 del, 13934 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2015S06_11 +%WER 50.45 [ 31086 / 61612, 8308 ins, 6588 del, 16190 sub ] exp/tri2b/decode/wer_test.LDC2015S06_13 +%WER 51.27 [ 31586 / 61612, 8305 ins, 6656 del, 16625 sub ] exp/tri3b/decode.si/wer_test.LDC2015S06_12 +%WER 52.65 [ 32436 / 61612, 8220 ins, 7524 del, 16692 sub ] exp/tri2a/decode/wer_test.LDC2015S06_12 +%WER 54.21 [ 33398 / 61612, 8128 ins, 8138 del, 17132 sub ] exp/tri1/decode/wer_test.LDC2015S06_12 + +# WER test.LDC2015S13 +%WER 19.24 [ 16273 / 84594, 2118 ins, 2624 del, 11531 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2015S13_8 +%WER 19.68 [ 16647 / 84594, 2117 ins, 2638 del, 11892 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2015S13_8 +%WER 20.02 [ 16936 / 84594, 2156 ins, 2665 del, 12115 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2015S13_8 +%WER 20.31 [ 17178 / 84594, 2179 ins, 2724 del, 12275 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2015S13_8 +%WER 20.68 [ 17494 / 84594, 2111 ins, 2905 del, 12478 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2015S13_9 +%WER 21.58 [ 18255 / 84594, 2222 ins, 3059 del, 12974 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S13_9 +%WER 22.08 [ 18678 / 84594, 1796 ins, 3753 del, 13129 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2015S13_10 +%WER 22.35 [ 18907 / 84594, 1921 ins, 3449 del, 13537 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2015S13_10 +%WER 22.54 [ 19066 / 84594, 1700 ins, 4044 del, 13322 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2015S13_12 +%WER 22.59 [ 19108 / 84594, 1821 ins, 3889 del, 13398 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2015S13_10 +%WER 22.64 [ 19152 / 84594, 2243 ins, 3274 del, 13635 sub ] exp/tri3b/decode/wer_test.LDC2015S13_12 +%WER 22.68 [ 19187 / 84594, 1950 ins, 3444 del, 13793 sub ] 
exp/tri2b_mmi/decode_it3/wer_test.LDC2015S13_10 +%WER 22.97 [ 19429 / 84594, 1748 ins, 4021 del, 13660 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2015S13_12 +%WER 26.22 [ 22178 / 84594, 2285 ins, 3818 del, 16075 sub ] exp/tri2b/decode/wer_test.LDC2015S13_12 +%WER 27.69 [ 23425 / 84594, 2163 ins, 4348 del, 16914 sub ] exp/tri3b/decode.si/wer_test.LDC2015S13_12 +%WER 27.74 [ 23470 / 84594, 2137 ins, 4463 del, 16870 sub ] exp/tri2a/decode/wer_test.LDC2015S13_12 +%WER 29.10 [ 24619 / 84594, 1912 ins, 5352 del, 17355 sub ] exp/tri1/decode/wer_test.LDC2015S13_13 + +# WER test.LDC2016S03 +%WER 48.17 [ 69952 / 145212, 34989 ins, 7540 del, 27423 sub ] exp/sgmm_5a_mmi_b0.1/decode/wer_test.LDC2016S03_10 +%WER 48.71 [ 70739 / 145212, 34599 ins, 7965 del, 28175 sub ] exp/sgmm_5a_mmi_b0.1/decode4/wer_test.LDC2016S03_11 +%WER 48.97 [ 71110 / 145212, 25206 ins, 19110 del, 26794 sub ] exp/tri2b_mmi_b0.1/decode_it4/wer_test.LDC2016S03_13 +%WER 48.99 [ 71138 / 145212, 34706 ins, 7910 del, 28522 sub ] exp/sgmm_5a_mmi_b0.1/decode3/wer_test.LDC2016S03_11 +%WER 49.39 [ 71725 / 145212, 34857 ins, 7904 del, 28964 sub ] exp/sgmm_5a_mmi_b0.1/decode2/wer_test.LDC2016S03_11 +%WER 49.68 [ 72141 / 145212, 24534 ins, 19597 del, 28010 sub ] exp/tri2b_mmi/decode_it4/wer_test.LDC2016S03_12 +%WER 49.84 [ 72372 / 145212, 34400 ins, 8414 del, 29558 sub ] exp/sgmm_5a_mmi_b0.1/decode1/wer_test.LDC2016S03_12 +%WER 49.90 [ 72459 / 145212, 27743 ins, 16413 del, 28303 sub ] exp/tri2b_mmi_b0.1/decode_it3/wer_test.LDC2016S03_14 +%WER 50.19 [ 72877 / 145212, 26408 ins, 17847 del, 28622 sub ] exp/tri2b_mmi/decode_it3/wer_test.LDC2016S03_14 +%WER 50.91 [ 73932 / 145212, 33830 ins, 9535 del, 30567 sub ] exp/sgmm_5a/decode/wer_test.LDC2016S03_13 +%WER 51.11 [ 74219 / 145212, 30731 ins, 14606 del, 28882 sub ] exp/tri2b_mpe/decode_it4/wer_test.LDC2016S03_15 +%WER 52.08 [ 75631 / 145212, 31779 ins, 13885 del, 29967 sub ] exp/tri2b_mpe/decode_it3/wer_test.LDC2016S03_15 +%WER 52.52 [ 76271 / 145212, 35202 ins, 9563 del, 
31506 sub ] exp/tri3b/decode/wer_test.LDC2016S03_17 +%WER 56.58 [ 82157 / 145212, 34695 ins, 11508 del, 35954 sub ] exp/tri2b/decode/wer_test.LDC2016S03_15 +%WER 57.28 [ 83179 / 145212, 33956 ins, 12500 del, 36723 sub ] exp/tri3b/decode.si/wer_test.LDC2016S03_16 +%WER 57.77 [ 83895 / 145212, 31963 ins, 14939 del, 36993 sub ] exp/tri2a/decode/wer_test.LDC2016S03_16 +%WER 59.27 [ 86074 / 145212, 30962 ins, 17056 del, 38056 sub ] exp/tri1/decode/wer_test.LDC2016S03_17 + +# CER test.LDC2013S04 +%WER 29.58 [ 45038 / 152279, 8264 ins, 9223 del, 27551 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2013S04_8 +%WER 30.12 [ 45873 / 152279, 8362 ins, 9107 del, 28404 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2013S04_8 +%WER 30.45 [ 46374 / 152279, 8377 ins, 9153 del, 28844 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2013S04_8 +%WER 30.95 [ 47132 / 152279, 8143 ins, 9769 del, 29220 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2013S04_9 +%WER 31.42 [ 47853 / 152279, 8134 ins, 9921 del, 29798 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2013S04_9 +%WER 32.54 [ 49545 / 152279, 7985 ins, 10732 del, 30828 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S04_9 +%WER 33.42 [ 50894 / 152279, 7672 ins, 11845 del, 31377 sub ] exp/tri3b/decode/cer_test.LDC2013S04_12 +%WER 33.72 [ 51348 / 152279, 7214 ins, 13316 del, 30818 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2013S04_10 +%WER 33.94 [ 51680 / 152279, 6748 ins, 14248 del, 30684 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2013S04_11 +%WER 33.98 [ 51737 / 152279, 7729 ins, 11833 del, 32175 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2013S04_10 +%WER 34.64 [ 52757 / 152279, 8561 ins, 10696 del, 33500 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2013S04_9 +%WER 34.67 [ 52802 / 152279, 7277 ins, 13490 del, 32035 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2013S04_11 +%WER 34.88 [ 53115 / 152279, 8059 ins, 12270 del, 32786 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2013S04_9 +%WER 38.75 [ 59002 / 152279, 7909 ins, 13549 del, 37544 
sub ] exp/tri2b/decode/cer_test.LDC2013S04_12 +%WER 40.49 [ 61655 / 152279, 8366 ins, 13257 del, 40032 sub ] exp/tri3b/decode.si/cer_test.LDC2013S04_10 +%WER 41.22 [ 62774 / 152279, 7165 ins, 15963 del, 39646 sub ] exp/tri2a/decode/cer_test.LDC2013S04_13 +%WER 42.69 [ 65004 / 152279, 7307 ins, 16188 del, 41509 sub ] exp/tri1/decode/cer_test.LDC2013S04_12 + +# CER test.LDC2013S08 +%WER 15.96 [ 21136 / 132434, 4775 ins, 3000 del, 13361 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2013S08_8 +%WER 16.30 [ 21593 / 132434, 4859 ins, 2856 del, 13878 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2013S08_7 +%WER 16.55 [ 21914 / 132434, 4786 ins, 3035 del, 14093 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2013S08_8 +%WER 16.82 [ 22272 / 132434, 4795 ins, 3084 del, 14393 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2013S08_8 +%WER 17.19 [ 22766 / 132434, 4804 ins, 3151 del, 14811 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2013S08_8 +%WER 17.90 [ 23712 / 132434, 4898 ins, 3447 del, 15367 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S08_8 +%WER 18.75 [ 24836 / 132434, 4339 ins, 4148 del, 16349 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2013S08_10 +%WER 18.96 [ 25105 / 132434, 4832 ins, 3953 del, 16320 sub ] exp/tri3b/decode/cer_test.LDC2013S08_11 +%WER 19.00 [ 25164 / 132434, 4160 ins, 4851 del, 16153 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2013S08_12 +%WER 19.01 [ 25182 / 132434, 4569 ins, 3748 del, 16865 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2013S08_10 +%WER 19.38 [ 25671 / 132434, 4798 ins, 3359 del, 17514 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2013S08_9 +%WER 19.42 [ 25716 / 132434, 4571 ins, 3923 del, 17222 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2013S08_9 +%WER 19.43 [ 25738 / 132434, 4303 ins, 4685 del, 16750 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2013S08_12 +%WER 22.36 [ 29618 / 132434, 5010 ins, 4337 del, 20271 sub ] exp/tri2b/decode/cer_test.LDC2013S08_11 +%WER 23.38 [ 30959 / 132434, 4820 ins, 4772 del, 21367 sub ] 
exp/tri3b/decode.si/cer_test.LDC2013S08_11 +%WER 24.48 [ 32421 / 132434, 4829 ins, 5141 del, 22451 sub ] exp/tri2a/decode/cer_test.LDC2013S08_12 +%WER 25.74 [ 34093 / 132434, 4727 ins, 5715 del, 23651 sub ] exp/tri1/decode/cer_test.LDC2013S08_12 + +# CER test.LDC2014S09 +%WER 37.36 [ 47080 / 126027, 16306 ins, 8137 del, 22637 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2014S09_10 +%WER 37.96 [ 47842 / 126027, 16721 ins, 7781 del, 23340 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2014S09_10 +%WER 38.20 [ 48139 / 126027, 17115 ins, 7320 del, 23704 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2014S09_9 +%WER 38.48 [ 48500 / 126027, 16796 ins, 7735 del, 23969 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2014S09_10 +%WER 38.88 [ 48998 / 126027, 16815 ins, 7810 del, 24373 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2014S09_10 +%WER 39.70 [ 50034 / 126027, 16290 ins, 9049 del, 24695 sub ] exp/sgmm_5a/decode/cer_test.LDC2014S09_11 +%WER 40.55 [ 51106 / 126027, 12063 ins, 15356 del, 23687 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2014S09_10 +%WER 41.02 [ 51702 / 126027, 13460 ins, 15578 del, 22664 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2014S09_12 +%WER 41.03 [ 51703 / 126027, 13360 ins, 13875 del, 24468 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2014S09_11 +%WER 41.08 [ 51774 / 126027, 16137 ins, 10241 del, 25396 sub ] exp/tri3b/decode/cer_test.LDC2014S09_14 +%WER 41.43 [ 52216 / 126027, 12943 ins, 14013 del, 25260 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2014S09_11 +%WER 41.44 [ 52223 / 126027, 11129 ins, 16614 del, 24480 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2014S09_10 +%WER 41.71 [ 52571 / 126027, 13916 ins, 15018 del, 23637 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2014S09_12 +%WER 45.78 [ 57690 / 126027, 14874 ins, 14113 del, 28703 sub ] exp/tri2b/decode/cer_test.LDC2014S09_14 +%WER 47.18 [ 59462 / 126027, 14177 ins, 16374 del, 28911 sub ] exp/tri2a/decode/cer_test.LDC2014S09_13 +%WER 47.21 [ 59502 / 126027, 15174 ins, 14317 del, 
30011 sub ] exp/tri3b/decode.si/cer_test.LDC2014S09_14 +%WER 48.49 [ 61109 / 126027, 13331 ins, 18880 del, 28898 sub ] exp/tri1/decode/cer_test.LDC2014S09_13 + +# CER test.LDC2015S06 +%WER 34.45 [ 34148 / 99132, 12809 ins, 5824 del, 15515 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2015S06_8 +%WER 34.95 [ 34650 / 99132, 12868 ins, 5686 del, 16096 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2015S06_8 +%WER 35.23 [ 34921 / 99132, 12869 ins, 5752 del, 16300 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2015S06_8 +%WER 35.53 [ 35225 / 99132, 12701 ins, 6047 del, 16477 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2015S06_9 +%WER 36.01 [ 35700 / 99132, 12722 ins, 6147 del, 16831 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2015S06_9 +%WER 36.86 [ 36538 / 99132, 12698 ins, 6493 del, 17347 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S06_9 +%WER 38.28 [ 37946 / 99132, 11826 ins, 9192 del, 16928 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2015S06_9 +%WER 38.34 [ 38009 / 99132, 12635 ins, 7528 del, 17846 sub ] exp/tri3b/decode/cer_test.LDC2015S06_12 +%WER 38.35 [ 38016 / 99132, 12245 ins, 7821 del, 17950 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2015S06_9 +%WER 38.55 [ 38211 / 99132, 12035 ins, 9225 del, 16951 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2015S06_10 +%WER 38.88 [ 38546 / 99132, 12302 ins, 7668 del, 18576 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2015S06_9 +%WER 39.01 [ 38672 / 99132, 12118 ins, 8917 del, 17637 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2015S06_10 +%WER 39.27 [ 38931 / 99132, 11682 ins, 9477 del, 17772 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2015S06_9 +%WER 42.63 [ 42261 / 99132, 12629 ins, 8447 del, 21185 sub ] exp/tri2b/decode/cer_test.LDC2015S06_11 +%WER 43.52 [ 43141 / 99132, 12665 ins, 8709 del, 21767 sub ] exp/tri3b/decode.si/cer_test.LDC2015S06_10 +%WER 44.95 [ 44562 / 99132, 12330 ins, 10278 del, 21954 sub ] exp/tri2a/decode/cer_test.LDC2015S06_11 +%WER 46.55 [ 46143 / 99132, 12202 ins, 11242 del, 22699 sub ] 
exp/tri1/decode/cer_test.LDC2015S06_11 + +# CER test.LDC2015S13 +%WER 13.50 [ 19001 / 140702, 2366 ins, 2994 del, 13641 sub ] exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2015S13_8 +%WER 13.88 [ 19524 / 140702, 2365 ins, 2990 del, 14169 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2015S13_8 +%WER 14.11 [ 19858 / 140702, 2383 ins, 3013 del, 14462 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2015S13_8 +%WER 14.33 [ 20158 / 140702, 2389 ins, 3059 del, 14710 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2015S13_8 +%WER 14.67 [ 20640 / 140702, 2482 ins, 2990 del, 15168 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2015S13_7 +%WER 15.42 [ 21702 / 140702, 2337 ins, 3609 del, 15756 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S13_9 +%WER 15.97 [ 22475 / 140702, 1954 ins, 4050 del, 16471 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2015S13_9 +%WER 16.23 [ 22838 / 140702, 1944 ins, 3994 del, 16900 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2015S13_10 +%WER 16.43 [ 23116 / 140702, 1765 ins, 4775 del, 16576 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2015S13_11 +%WER 16.44 [ 23135 / 140702, 2306 ins, 3973 del, 16856 sub ] exp/tri3b/decode/cer_test.LDC2015S13_11 +%WER 16.50 [ 23214 / 140702, 2058 ins, 4180 del, 16976 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2015S13_9 +%WER 16.56 [ 23296 / 140702, 2211 ins, 3512 del, 17573 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2015S13_9 +%WER 16.81 [ 23654 / 140702, 1810 ins, 4651 del, 17193 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2015S13_11 +%WER 19.63 [ 27616 / 140702, 2395 ins, 4467 del, 20754 sub ] exp/tri2b/decode/cer_test.LDC2015S13_11 +%WER 21.01 [ 29562 / 140702, 2365 ins, 4751 del, 22446 sub ] exp/tri3b/decode.si/cer_test.LDC2015S13_10 +%WER 21.15 [ 29758 / 140702, 2017 ins, 5741 del, 22000 sub ] exp/tri2a/decode/cer_test.LDC2015S13_12 +%WER 22.48 [ 31633 / 140702, 1928 ins, 6389 del, 23316 sub ] exp/tri1/decode/cer_test.LDC2015S13_12 + +# CER test.LDC2016S03 +%WER 40.99 [ 96423 / 235216, 54232 ins, 10004 del, 32187 sub ] 
exp/sgmm_5a_mmi_b0.1/decode/cer_test.LDC2016S03_10 +%WER 41.52 [ 97659 / 235216, 54881 ins, 9585 del, 33193 sub ] exp/sgmm_5a_mmi_b0.1/decode4/cer_test.LDC2016S03_10 +%WER 41.78 [ 98270 / 235216, 55048 ins, 9598 del, 33624 sub ] exp/sgmm_5a_mmi_b0.1/decode3/cer_test.LDC2016S03_10 +%WER 42.11 [ 99051 / 235216, 54335 ins, 10340 del, 34376 sub ] exp/sgmm_5a_mmi_b0.1/decode2/cer_test.LDC2016S03_11 +%WER 42.28 [ 99452 / 235216, 41779 ins, 24370 del, 33303 sub ] exp/tri2b_mmi_b0.1/decode_it4/cer_test.LDC2016S03_12 +%WER 42.50 [ 99971 / 235216, 53658 ins, 11074 del, 35239 sub ] exp/sgmm_5a_mmi_b0.1/decode1/cer_test.LDC2016S03_12 +%WER 43.03 [ 101223 / 235216, 38061 ins, 29066 del, 34096 sub ] exp/tri2b_mmi/decode_it4/cer_test.LDC2016S03_12 +%WER 43.11 [ 101399 / 235216, 43075 ins, 23843 del, 34481 sub ] exp/tri2b_mmi_b0.1/decode_it3/cer_test.LDC2016S03_14 +%WER 43.29 [ 101832 / 235216, 53048 ins, 12380 del, 36404 sub ] exp/sgmm_5a/decode/cer_test.LDC2016S03_12 +%WER 43.33 [ 101926 / 235216, 43250 ins, 22694 del, 35982 sub ] exp/tri2b_mmi/decode_it3/cer_test.LDC2016S03_13 +%WER 43.86 [ 103167 / 235216, 48178 ins, 19999 del, 34990 sub ] exp/tri2b_mpe/decode_it4/cer_test.LDC2016S03_14 +%WER 44.76 [ 105279 / 235216, 54744 ins, 12548 del, 37987 sub ] exp/tri3b/decode/cer_test.LDC2016S03_15 +%WER 44.77 [ 105298 / 235216, 49827 ins, 18866 del, 36605 sub ] exp/tri2b_mpe/decode_it3/cer_test.LDC2016S03_14 +%WER 48.68 [ 114501 / 235216, 52710 ins, 16670 del, 45121 sub ] exp/tri2b/decode/cer_test.LDC2016S03_15 +%WER 49.57 [ 116592 / 235216, 53572 ins, 16111 del, 46909 sub ] exp/tri3b/decode.si/cer_test.LDC2016S03_14 +%WER 50.03 [ 117681 / 235216, 49653 ins, 20856 del, 47172 sub ] exp/tri2a/decode/cer_test.LDC2016S03_15 +%WER 51.54 [ 121232 / 235216, 49022 ins, 23008 del, 49202 sub ] exp/tri1/decode/cer_test.LDC2016S03_15 diff --git a/egs/gale_mandarin/s5/conf/decode.config b/egs/gale_mandarin/s5/conf/decode.config new file mode 100644 index 00000000000..e69de29bb2d diff --git 
a/egs/gale_mandarin/s5/local/bad_utts b/egs/gale_mandarin/s5/local/bad_utts new file mode 100644 index 00000000000..6683c9a97a5 --- /dev/null +++ b/egs/gale_mandarin/s5/local/bad_utts @@ -0,0 +1,12 @@ +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20070308_040701 +CCTV2_ECONOMYANDLAW_CMN_20070426_202800 +CCTV2_ECONOMYANDLAW_CMN_20070426_202800(1) +CCTV2_LIANGHUI_PROBLEM_20070308_213000 +CCTV4_TDYFOCUS_CMN_20070824_092801 +VOA_ISSUESANDOPINIONS_CMN_20070801_210500 +VOA_ISSUESANDOPINIONS_CMN_20070926_210500 +VOA_LISTENERSHOTLINE_CMN_20070906_223000 +VOA_LISTENERSHOTLINE_CMN_20070926_223000 +VOA_LISTENERSHOTLINE_CMN_20070927_223000 +PHOENIX_NEWSLINE_CMN_20070101_114800 +PHOENIX_NEWSLINE_CMN_20070101_114800(1) diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh b/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh index c2d61cfb864..598c6b37c17 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh @@ -1,46 +1,69 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014 QCRI (author: Ahmed Ali) +# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal) # Apache 2.0 -if [ $# -ne 2 ]; then - echo "Arguments should be the "; exit 1 -fi +echo $0 "$@" + +galeData=$(readlink -f "${@: -1}" ); +wavedir=$galeData/wav +mkdir -p $wavedir + -# check that sox is installed +length=$(($#-1)) +args=${@:1:$length} +# check that sox is installed which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed" - exit 1 +if [[ $? != 0 ]]; then + echo "$0: sox is not installed" + exit 1 fi -galeData=$1 -wavedir=$galeData/wav -mkdir -p $wavedir +set -e -o pipefail + +for var in $args; do + CD=$(basename $var) + [ -d $wavedir/$CD ] && rm -rf $wavedir/$CD + mkdir -p $wavedir/$CD + find $var -type f -name *.wav | while read file; do + f=$(basename $file) + if [[ ! 
-L "$wavedir/$CD/$f" ]]; then + ln -sf $file $wavedir/$CD/$f + fi + done -audio_path=$2 - -mkdir -p $wavedir/ - -#copy and convert the flac to wav -find $audio_path -type f -name *.flac | while read file; do - f_name=$(basename $file) - if [[ ! -e $wavedir/"${f_name%.flac}.wav" ]]; then - echo "soxing $file to $wavedir/$CD/"${f_name%.flac}.wav" " - sox $file $wavedir/"${f_name%.flac}.wav" - fi - + #make a flac symlink as well + find $var -type f -name *.flac | while read file; do + f=$(basename $file) + + if [[ ! -L "$wavedir/$CD/$f" ]]; then + ln -sf $file $wavedir/$CD/$f + fi + done done -find $wavedir -name *.wav > $galeData/wav$$ -awk -F "/" '{print $NF}' $galeData/wav$$ | sed 's:\.wav::' > $galeData/id$$ -paste -d ' ' $galeData/id$$ $galeData/wav$$ | sort -u > $galeData/wav.scp +#figure out the proper sox command line +#the flac will be converted on the fly +( + for w in `find $wavedir -name *.wav` ; do + base=`basename $w .wav` + fullpath=`readlink -f $w` + echo "$base sox $fullpath -r 16000 -t wav - |" + done + + for w in `find $wavedir -name *.flac` ; do + base=`basename $w .flac` + fullpath=`readlink -f $w` + echo "$base sox $fullpath -r 16000 -t wav - |" + done +) | sort -u > $galeData/wav.scp -#clean +#clean rm -fr $galeData/id$$ $galeData/wav$$ -echo data prep audio succeded +echo "$0: data prep audio succeeded" exit 0 diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_split.sh b/egs/gale_mandarin/s5/local/gale_data_prep_split.sh index 63b6d8d2f7b..40c29415a1e 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_split.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_split.sh @@ -1,37 +1,33 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014 (author: Ahmed Ali, Hainan Xu) +# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal) # Apache 2.0 if [ $# -ne 1 ]; then echo "Arguments should be the "; exit 1 fi +set -e -o pipefail #data will data/local galeData=$(readlink -f $1) mkdir -p data/local dir=$(readlink -f data/local) -cat 
$galeData/utt2spk | awk '{print$2}' | sort -u > $galeData/spklist - -cat $galeData/spklist | utils/shuffle_list.pl --srand ${seed:-777} > $galeData/spklist.shuffled - -# we want about 6h dev data; 300 is manually chosen -cat $galeData/spklist.shuffled | head -n 300 > $galeData/spklist.dev - - -cat $galeData/utt2spk | grep -f $galeData/spklist.dev | awk '{print$1}' > $galeData/dev.list # some problem with the text data; same utt id but different transcription -cat $galeData/all | awk '{print$2}' | sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list +cat $galeData/all | awk '{print$2}' | \ + sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list -utils/filter_scp.pl --exclude -f 2 $galeData/dup.list $galeData/all > $galeData/all_nodup +utils/filter_scp.pl --exclude -f 2 \ + $galeData/dup.list $galeData/all > $galeData/all.nodup -mv $galeData/all_nodup $galeData/all +mv $galeData/all $galeData/all.orig +mv $galeData/all.nodup $galeData/all -utils/filter_scp.pl -f 2 $galeData/dev.list $galeData/all > $galeData/all.dev -utils/filter_scp.pl --exclude -f 2 $galeData/dev.list $galeData/all > $galeData/all.train +grep -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.dev +grep -v -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.train cat $galeData/all.dev | awk '{print$2}' > $galeData/dev_utt_list cat $galeData/all.train | awk '{print$2}' > $galeData/train_utt_list @@ -46,11 +42,11 @@ utils/utt2spk_to_spk2utt.pl $dir/train/utt2spk | sort -u > $dir/train/spk2utt for x in dev train; do outdir=$dir/$x - file=$galeData/all.$x + file=$galeData/all.$x mkdir -p $outdir awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done +done cat $dir/dev/segments | awk '{print$2}' | sort -u > $galeData/dev.wav.list cat $dir/train/segments | awk '{print$2}' | sort -u > 
$galeData/train.wav.list @@ -60,5 +56,6 @@ utils/filter_scp.pl -f 1 $galeData/train.wav.list $galeData/wav.scp > $dir/train cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - + + echo data prep split succeeded diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh b/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh index 3fe32055f6c..7e3e57c92a8 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh @@ -1,28 +1,38 @@ #!/bin/bash # Copyright 2014 (author: Ahmed Ali, Hainan Xu) +# Copyright 2016 Johns Hopkins Univeersity (author: Jan "Yenda" Trmal) # Apache 2.0 -if [ $# -ne 2 ]; then - echo "Arguments should be the "; exit 1 -fi - +echo $0 "$@" export LC_ALL=C -galeData=$1 -text=$2 +galeData=$(readlink -f "${@: -1}" ); -cur=`pwd` +length=$(($#-1)) +args=${@:1:$length} -txtdir=$galeData/txt -mkdir -p $galeData/txt +top_pwd=`pwd` +txtdir=$galeData/txt +mkdir -p $txtdir cd $txtdir -find $text -type f -name *.tdf | while read file; do -sed '1,3d' $file -done > all.tmp +for cdx in ${args[@]}; do + echo "Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + tgt=$(basename $cdx) + test -x $tgt || ln -s $cdx `basename $tgt` + else + echo "I don't really know what I shall do with $cdx " >&2 + fi +done +find -L . 
-type f -name *.tdf | while read file; do +sed '1,3d' $file +done > all.tmp perl -e ' ($inFile,$idFile,$txtFile,$spk,$mapf)= split /\s+/, $ARGV[0]; @@ -34,22 +44,35 @@ perl -e ' while () { @arr= split /\t/,$_; $arr[4] =~ s/ //g; + $arr[4] = sprintf("%020s", $arr[4]); $spkid = "$arr[0]_$arr[4]"; - $spkfix = sprintf("%060s", $spkid); - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - $id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; + $spkfix = sprintf("%080s", $spkid); + + $start=sprintf ("%0.3f",$arr[2]); + $rStart=$start; + $start=~s/\.//; + $start=~s/^0+$/0/; + $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $start = sprintf("%09s", $start); + + $end=sprintf ("%0.3f",$arr[3]); + $rEnd=$end; + $end=~s/^0+([^0])/$1/; + $end=~s/\.//; + $end = sprintf("%09s", $end); + + $id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; print TXT "$arr[7]\n"; print SPK "${spkfix}_$arr[0]_${start}_${end} ${spkfix}\n"; print MAP "$arr[0] ${spkfix}_$arr[0]\n"; - }' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp" + }' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp" perl -p -i -e 's=/.$==g' contentall.tmp -cd $cur +cd $top_pwd pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` @@ -57,11 +80,11 @@ export PYTHONPATH=$PYTHONPATH:`pwd`/tools/mmseg-1.3.0/lib/python${pyver}/site-pa if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then echo "--- Downloading mmseg-1.3.0 ..." echo "NOTE: it assumes that you have Python, Setuptools installed on your system!" 
- wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz + wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz tar xf tools/mmseg-1.3.0.tar.gz -C tools cd tools/mmseg-1.3.0 mkdir -p lib/python${pyver}/site-packages - python setup.py build + CC=gcc CXX=g++ python setup.py build python setup.py install --prefix=. cd ../.. if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then @@ -90,11 +113,8 @@ awk '{$1="";print $0}' $txtdir/all_1.tmp | sed 's:^ ::' > $txtdir/../all cat $txtdir/utt2spk.tmp | sort -u > $txtdir/../utt2spk cat $txtdir/map.tmp | sort -u > $txtdir/../map -sort -c $txtdir/../utt2spk +sort -c $txtdir/../utt2spk utils/utt2spk_to_spk2utt.pl $txtdir/../utt2spk | sort -u > $txtdir/../spk2utt -cd ..; -rm -fr $txtdir - echo data prep text succeeded diff --git a/egs/gale_mandarin/s5/local/gale_format_data.sh b/egs/gale_mandarin/s5/local/gale_format_data.sh index 71187e89a12..204fa31fd42 100755 --- a/egs/gale_mandarin/s5/local/gale_format_data.sh +++ b/egs/gale_mandarin/s5/local/gale_format_data.sh @@ -8,19 +8,20 @@ if [ -f path.sh ]; then echo "missing path.sh"; exit 1; fi +set -e -o pipefail +set -x + for dir in dev train; do - cp -pr data/local/$dir data/$dir + cp -prT data/local/$dir data/$dir done export LC_ALL=C -mkdir -p data/lang_dev - arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -rm -r data/lang_dev -cp -r data/lang data/lang_dev +rm -r data/lang_test || true +cp -r data/lang data/lang_test gunzip -c "$arpa_lm" | \ arpa2fst --disambig-symbol=#0 \ @@ -28,31 +29,35 @@ gunzip -c "$arpa_lm" | \ echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_dev/G.fst +fstisstochastic data/lang_test/G.fst || true ## Check lexicon. ## just have a look and make sure it seems sane. 
echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - +( + fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head +) || true echo Performing further checks # Checking that G.fst is determinizable. -fstdeterminize data/lang_dev/G.fst /dev/null || echo Error determinizing G. +fstdeterminize data/lang_test/G.fst /dev/null || { + echo Error determinizing G. + exit 1 +} # Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_dev/L_disambig.fst /dev/null || echo Error determinizing L. +fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. # Checking that disambiguated lexicon times G is determinizable # Note: we do this with fstdeterminizestar not fstdeterminize, as # fstdeterminize was taking forever (presumbaly relates to a bug # in this version of OpenFst that makes determinization slow for # some case). -fsttablecompose data/lang_dev/L_disambig.fst data/lang_dev/G.fst | \ +fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ fstdeterminizestar >/dev/null || echo Error # Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_dev/G.fst | \ +fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo LG is not stochastic diff --git a/egs/gale_mandarin/s5/local/gale_prep_dict.sh b/egs/gale_mandarin/s5/local/gale_prep_dict.sh index cd3ed602c70..f1e39fb452e 100755 --- a/egs/gale_mandarin/s5/local/gale_prep_dict.sh +++ b/egs/gale_mandarin/s5/local/gale_prep_dict.sh @@ -1,13 +1,14 @@ #!/bin/bash # prepare dictionary for HKUST -# it is done for English and Chinese separately, +# it is done for English and Chinese separately, # For English, we use CMU dictionary, and Sequitur G2P # for OOVs, while all englist phone set will concert to Chinese # phone set at the end. 
For Chinese, we use an online dictionary, # for OOV, we just produce pronunciation using Charactrt Mapping. - -. path.sh +. ./path.sh + +set -e -o pipefail [ $# != 0 ] && echo "Usage: local/hkust_prepare_dict.sh" && exit 1; train_dir=data/local/train @@ -23,18 +24,29 @@ esac # extract full vocabulary cat $train_dir/text $dev_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\ - sed -e 's/ /\n/g' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\ - grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt + sed -e 's/ /\n/g' | sort -u | \ + grep -v '\[LAUGHTER\]' | \ + grep -v '\[NOISE\]' |\ + grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt # split into English and Chinese cat $dict_dir/vocab-full.txt | grep '[a-zA-Z]' > $dict_dir/vocab-en.txt -cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt +cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' | \ + perl -CSD -Mutf8 -ane '{print if /^\p{InCJK_Unified_Ideographs}+$/;}' > $dict_dir/vocab-ch.txt +cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' | \ + perl -CSD -Mutf8 -ane '{print unless /^\p{InCJK_Unified_Ideographs}+$/;}' > $dict_dir/vocab-weird.txt + -# produce pronunciations for english +# produce pronunciations for english if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then echo "--- Downloading CMU dictionary ..." - svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ - $dict_dir/cmudict || exit 1; + svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/ $dict_dir/cmudict || \ + wget -e robots=off -r -np -nH --cut-dirs=4 -R index.html http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/ -P $dict_dir || exit 1 +fi + +if [ ! -f $dict_dir/cmudict/scripts/make_baseform.pl ] ; then + echo "$0: $dict_dir/cmudict/scripts/make_baseform.pl does not exist!"; + exit fi echo "--- Striping stress and pronunciation variant markers from cmudict ..." 
@@ -54,23 +66,6 @@ gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ wc -l $dict_dir/vocab-en-oov.txt wc -l $dict_dir/lexicon-en-iv.txt -pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` -export PYTHONPATH=$PYTHONPATH:`pwd`/tools/g2p/lib/python${pyver}/site-packages -if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then - echo "--- Downloading Sequitur G2P ..." - echo "NOTE: it assumes that you have Python, NumPy and SWIG installed on your system!" - wget -P tools http://www-i6.informatik.rwth-aachen.de/web/Software/g2p-r1668.tar.gz - tar xf tools/g2p-r1668.tar.gz -C tools - cd tools/g2p - echo '#include ' >> Utility.hh # won't compile on my system w/o this "patch" - python setup.py build - python setup.py install --prefix=. - cd ../.. - if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then - echo "Sequitur G2P is not found - installation failed?" - exit 1 - fi -fi if [ ! -f conf/g2p_model ]; then echo "--- Downloading a pre-trained Sequitur G2P model ..." @@ -82,8 +77,11 @@ if [ ! -f conf/g2p_model ]; then fi echo "--- Preparing pronunciations for OOV words ..." -python tools/g2p/lib/python${pyver}/site-packages/g2p.py \ - --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt +if [ ! -x g2p.py ]; then + echo "g2p.py is not found. Checkout tools/extra/install_sequitur.sh." + exit 1 +fi +g2p.py --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ sort > $dict_dir/lexicon-en-phn.txt @@ -91,25 +89,25 @@ cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ -# produce pronunciations for chinese +# produce pronunciations for chinese if [ ! 
-f $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt ]; then - wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz + wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz gunzip $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt.gz fi cat $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ - perl -e ' + perl -e ' while () { @A = split(" ", $_); print $A[1]; for($n = 2; $n < @A; $n++) { - $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; - $tmp = uc($A[$n]); + $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; + $tmp = uc($A[$n]); print " $tmp"; } print "\n"; } - ' | sort -k1 > $dict_dir/ch-dict.txt + ' | sort -k1 > $dict_dir/ch-dict.txt echo "--- Searching for Chinese OOV words ..." gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \ @@ -120,22 +118,22 @@ gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ $dict_dir/vocab-ch.txt $dict_dir/ch-dict.txt |\ egrep -v '<.?s>' > $dict_dir/lexicon-ch-iv.txt -wc -l $dict_dir/vocab-ch-oov.txt -wc -l $dict_dir/lexicon-ch-iv.txt +wc -l $dict_dir/vocab-ch-oov.txt || true +wc -l $dict_dir/lexicon-ch-iv.txt || true # this unset LC_ALL -# first make sure number of characters and pinyins -# are equal +# first make sure number of characters and pinyins +# are equal cat $dict_dir/ch-dict.txt |\ perl -e ' use encoding utf8; while () { @A = split(" ", $_); $word_len = length($A[0]); - $proun_len = @A - 1 ; + $proun_len = @A - 1 ; if ($word_len == $proun_len) {print $_;} } ' > $dict_dir/ch-dict-1.txt @@ -144,11 +142,12 @@ cat $dict_dir/ch-dict-1.txt | awk '{print $1}' | sed -e 's/\(\S\)/\1\n/g' | grep cat $dict_dir/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/ch-char-pinyin.txt wc -l $dict_dir/ch-char.txt wc -l $dict_dir/ch-char-pinyin.txt -paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt +paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt + cat 
$dict_dir/ch-char-dict.txt |\ perl -e ' - my $prev = ""; + my $prev = ""; my $out_line = ""; while () { @A = split(" ", $_); @@ -157,16 +156,16 @@ cat $dict_dir/ch-char-dict.txt |\ #print length($prev); if (length($prev) == 0) { $out_line = $_; chomp($out_line);} if (length($prev)>0 && $cur ne $prev) { print $out_line; print "\n"; $out_line = $_; chomp($out_line);} - if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} + if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} $prev = $cur; } - print $out_line; - ' > $dict_dir/ch-char-dict-1.txt + print $out_line; + ' > $dict_dir/ch-char-dict-1.txt cat $dict_dir/vocab-ch-oov.txt | awk -v w=$dict_dir/ch-char-dict-1.txt \ - 'BEGIN{while((getline0) dict[$1]=$2;} + 'BEGIN{while((getline0) dict[$1]=$2;} {printf("%s", $1); for (i=1; i<=length($1); i++) { py=substr($1, i, 1); printf(" %s", dict[py]); } printf("\n"); }' \ - > $dict_dir/lexicon-ch-oov.txt + > $dict_dir/lexicon-ch-oov.txt cat $dict_dir/lexicon-ch-oov.txt |\ perl -e ' @@ -175,8 +174,8 @@ cat $dict_dir/lexicon-ch-oov.txt |\ while () { @A = split(" ", $_); @entry = (); - push(@entry, $A[0]); - for($i = 1; $i < @A; $i++ ) { + push(@entry, $A[0]); + for($i = 1; $i < @A; $i++ ) { @py = split("/", $A[$i]); @entry1 = @entry; @entry = (); @@ -184,29 +183,29 @@ cat $dict_dir/lexicon-ch-oov.txt |\ for ($k = 0; $k < @py; $k++) { $tmp = $entry1[$j]." 
".$py[$k]; push(@entry, $tmp); - } - } + } + } } for ($i = 0; $i < @entry; $i++) { - print $entry[$i]; + print $entry[$i]; print "\n"; - } + } } ' > $dict_dir/lexicon-ch-oov1.txt cat $dict_dir/lexicon-ch-oov1.txt $dict_dir/lexicon-ch-iv.txt |\ - awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch.txt + awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch.txt cat $dict_dir/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch-cmu.txt -cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/cmu +cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/cmu cat conf/pinyin2cmu | awk -v cmu=$dict_dir/cmu \ 'BEGIN{while((getline $dict_dir/cmu-used cat $dict_dir/cmu | awk -v cmu=$dict_dir/cmu-used \ 'BEGIN{while((getline $dict_dir/cmu-not-used + {if (!dict[$1]) print $1;}' > $dict_dir/cmu-not-used gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ $dict_dir/cmu-not-used conf/cmu2pinyin |\ @@ -229,9 +228,9 @@ cat $dict_dir/cmu-py | \ push(@entry, $W); for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } print "@entry"; - print "\n"; - } -' conf/pinyin2cmu > $dict_dir/cmu-cmu + print "\n"; + } +' conf/pinyin2cmu > $dict_dir/cmu-cmu cat $dict_dir/lexicon-en-phn.txt | \ perl -e ' @@ -248,14 +247,14 @@ cat $dict_dir/lexicon-en-phn.txt | \ @entry = (); $W = shift(@A); push(@entry, $W); - for($i = 0; $i < @A; $i++) { + for($i = 0; $i < @A; $i++) { if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } else {push(@entry, $A[$i])}; } print "@entry"; - print "\n"; + print "\n"; } -' $dict_dir/cmu-cmu > $dict_dir/lexicon-en.txt +' $dict_dir/cmu-cmu > $dict_dir/lexicon-en.txt cat $dict_dir/lexicon-en.txt $dict_dir/lexicon-ch-cmu.txt |\ sort -u > $dict_dir/lexicon1.txt @@ -267,8 +266,8 @@ cat $dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{f while () { $phone = $_; chomp($phone); - chomp($_); - $phone =~ s:([A-Z]+)[0-9]:$1:; + chomp($_); + $phone =~ 
s:([A-Z]+)[0-9]:$1:; if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } else { $ph_cl{$phone} = [$_]; } } @@ -298,7 +297,5 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", export LC_ALL=C +echo "$0: Done" - - -exit 1; diff --git a/egs/gale_mandarin/s5/local/gale_train_lms.sh b/egs/gale_mandarin/s5/local/gale_train_lms.sh index af429ae2af1..b70bf8de564 100755 --- a/egs/gale_mandarin/s5/local/gale_train_lms.sh +++ b/egs/gale_mandarin/s5/local/gale_train_lms.sh @@ -4,13 +4,13 @@ # To be run from one directory above this script. -lexicon=data/local/dict/lexicon.txt +lexicon=data/local/dict/lexicon.txt [ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; # check if sri is installed or no sri_installed=false which ngram-count &>/dev/null -if [[ $? == 0 ]]; then +if [[ $? == 0 ]]; then sri_installed=true fi @@ -23,9 +23,9 @@ fi export LC_ALL=C # You'll get errors about things being not sorted, if you # have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm +export PATH=$PATH:$KALDI_ROOT/tools/kaldi_lm ( # First make sure the kaldi_lm toolkit is installed. - cd ../../../tools || exit 1; + cd $KALDI_ROOT/tools || exit 1; if [ -d kaldi_lm ]; then echo Not installing the kaldi_lm toolkit since it is already there. else @@ -45,10 +45,10 @@ dir=data/local/lm mkdir -p $dir text=data/local/train/text [ ! 
-f $text ] && echo "$0: No such file $text" && exit 1; - + cleantext=$dir/text.no_oov - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ > $cleantext || exit 1; @@ -72,20 +72,20 @@ dir=data/local/lm cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ || exit 1; - + train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; # LM is small enough that we don't need to prune it (only about 0.7M N-grams). # Perplexity over 128254.000000 words is 90.446690 # note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz +# data/local/lm/3gram-mincount/lm_unpruned.gz # From here is some commands to do a baseline with SRILM (assuming # you have it installed). -if $sri_installed; then +if $sri_installed; then heldout_sent=10000 # Don't change this if you want result to be comparable with # kaldi_lm results @@ -101,14 +101,14 @@ if $sri_installed; then ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz - ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout + ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout # 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482 # Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above. # Difference in WSJ must have been due to different treatment of . 
- ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout + ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout # 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379 fi -echo train lm succeeded \ No newline at end of file +echo train lm succeeded diff --git a/egs/gale_mandarin/s5/local/split_wer.sh b/egs/gale_mandarin/s5/local/split_wer.sh deleted file mode 100755 index 38cdb3af991..00000000000 --- a/egs/gale_mandarin/s5/local/split_wer.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash - -# Report WER for reports and conversational -# Copyright 2014 QCRI (author: Ahmed Ali) -# Apache 2.0 - -if [ $# -ne 1 ]; then - echo "Arguments should be the gale folder, see ../run.sh for example." - exit 1; -fi - -[ -f ./path.sh ] && . ./path.sh - - -galeFolder=$(readlink -f $1) -symtab=./data/lang/words.txt - -#split the test set per type: -awk '{print $2}' $galeFolder/all.test | sort -u > $galeFolder/test_id$$ - -# generate the report test set -awk '{print $2}' $galeFolder/report | sort -u > $galeFolder/report_id$$ -comm -1 -2 $galeFolder/test_id$$ $galeFolder/report_id$$ > $galeFolder/report.test - -# generate the conversational test set -awk '{print $2}' $galeFolder/conversational | sort -u > $galeFolder/conversational_id$$ - -comm -1 -2 $galeFolder/test_id$$ $galeFolder/conversational_id$$ > $galeFolder/conversational.test - -rm -fr $galeFolder/test_id$$ $galeFolder/report_id$$ $galeFolder/conversational_id$$ - -min_lmwt=9 -max_lmwt=20 -for dir in exp/*/*decode*; do - for type in report conversational; do - #echo "Processing: $dir $type" - rm -fr $dir/scoring_$type - cp -pr $dir/scoring $dir/scoring_$type - ( cd $dir/scoring_$type; - for x in *.tra test_filt.txt; do - sort -u $x > tmp$$ - join tmp$$ $galeFolder/${type}.test > $x - rm -fr tmp$$ - done - ) - -utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ - cat $dir/scoring_${type}/LMWT.tra \| \ - utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ - compute-wer --text 
--mode=present \ - ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT -done -done - - -time=$(date +"%Y-%m-%d-%H-%M-%S") -echo "RESULTS generated by $USER at $time" - -echo "Report Results WER:" -for x in exp/*/*decode*; do [ -d $x ] && grep WER $x/wer_report_* | utils/best_wer.sh; done | sort -n -k2 - -echo "Conversational Results WER:" -for x in exp/*/*decode*; do [ -d $x ] && grep WER $x/wer_conversational_* | utils/best_wer.sh; done | sort -n -k2 - -echo "Combined Results for Reports and Conversational WER:" -for x in exp/*/*decode*; do [ -d $x ] && grep WER $x/wer_?? $x/wer_?| utils/best_wer.sh; done | sort -n -k2 - - - - diff --git a/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh b/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh new file mode 100755 index 00000000000..7fc51e74846 --- /dev/null +++ b/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . 
./path.sh + +set -o pipefail -e + +galeFolder=$(readlink -f $1) +symtab=./data/lang/words.txt + +min_lmwt=7 +max_lmwt=20 + +for dir in exp/*/*decode*; do + for type in $(ls -1 local/test.* | xargs -n1 basename); do + rm -fr $dir/scoring_$type + mkdir -p $dir/scoring_$type/log + for x in $dir/scoring/*.char $dir/scoring/*.tra $dir/scoring/char.filt $dir/scoring/text.filt; do + cat $x | grep -f local/$type > $dir/scoring_$type/$(basename $x) + done + + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/text.filt ark,p:- ">&" $dir/wer_${type}_LMWT + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.cer.LMWT.log \ + cat $dir/scoring_${type}/LMWT.char \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/char.filt ark,p:- ">&" $dir/cer_${type}_LMWT + done +done + +for type in $(ls -1 local/test.* | xargs -n1 basename); do + echo -e "\n# WER $type" + for x in exp/*/*decode*; do + grep WER $x/wer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + +for type in $(ls -1 local/test.* | xargs -n1 basename); do + echo -e "\n# CER $type" + for x in exp/*/*decode*; do + grep WER $x/cer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + + + diff --git a/egs/gale_mandarin/s5/local/test.LDC2013S04 b/egs/gale_mandarin/s5/local/test.LDC2013S04 new file mode 100644 index 00000000000..60b3a95110d --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2013S04 @@ -0,0 +1,20 @@ +CCTV4_ACROSSSTRAIT_CMN_20070108_073033 +PHOENIX_NEWSLINE_CMN_20070101_114800 +CCTV4_TDYFOCUS_CMN_20070111_082801 +CCTV2_ECONOMYANDLAW_CMN_20070126_203005 +PHOENIX_BEHINDHL_CMN_20061004_052800 +PHOENIX_NEWSHACK_CMN_20060923_212301 +PHOENIX_NEWSLINE_CMN_20070102_114800 +PHOENIX_ASIANJRNL_CMN_20070313_075800 +PHOENIX_BEHINDHL_CMN_20061012_052800 +PHOENIX_NEWSLINE_CMN_20070105_114800 
+CCTV4_TDYFOCUS_CMN_20061023_092800 +PHOENIX_SOCWATCH_CMN_20060928_225801 +PHOENIX_BEHINDHL_CMN_20061011_052800 +CCTVNEWS_TELLITLIKEITIS_CMN_20070114_140701 +CCTV4_TDYFOCUS_CMN_20070104_082800 +PHOENIX_NEWSLINE_CMN_20061020_114800 +PHOENIX_ASIANJRNL_CMN_20061002_085800 +PHOENIX_BEHINDHL_CMN_20070102_052800 +CCTV4_TDYFOCUS_CMN_20070108_082800 +PHOENIX_ASIANJRNL_CMN_20070111_075800 diff --git a/egs/gale_mandarin/s5/local/test.LDC2013S08 b/egs/gale_mandarin/s5/local/test.LDC2013S08 new file mode 100644 index 00000000000..6c0279412e9 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2013S08 @@ -0,0 +1,20 @@ +CCTV4_DAILYNEWS_CMN_20061023_135801 +CCTV4_DAILYNEWS_CMN_20060923_135800 +PHOENIX_PHNXWRLD_CMN_20070101_111800 +CCTV4_NEWS3_CMN_20060921_085800 +CCTV7_MILITARYNEWS1_CMN_20070102_193006 +PHOENIX_PHNXWRLD_CMN_20061024_112500 +CCTV7_MILITARYNEWS1_CMN_20070113_193011 +CCTV4_NEWS3_CMN_20061003_085800 +PHOENIX_PHNXWRLD_CMN_20061019_112401 +CCTV4_DAILYNEWS_CMN_20060920_135800 +PHOENIX_GOODMORNCN_CMN_20060926_185800 +ANHUI_NEWSREVIEW_CMN_20070103_175711 +CCTV4_DAILYNEWS_CMN_20060915_135800 +CCTV4_DAILYNEWS_CMN_20060924_135801 +PHOENIX_PHNXWRLD_CMN_20061018_112400 +CCTV7_MILITARYNEWS1_CMN_20070127_192932 +CCTVNEWS_EVENINGNEWS_CMN_20070123_225701 +CCTV4_NEWS3_CMN_20070116_075800 +PHOENIX_GOODMORNCN_CMN_20060918_185800 +PHOENIX_GOODMORNCN_CMN_20061009_185800 diff --git a/egs/gale_mandarin/s5/local/test.LDC2014S09 b/egs/gale_mandarin/s5/local/test.LDC2014S09 new file mode 100644 index 00000000000..ed871874636 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2014S09 @@ -0,0 +1,20 @@ +CCTV2_BUSINESSHOUR_CMN_20070428_070000 +CCTV1_LEGALREPORT_CMN_20070315_123701 +CCTV1_LEGALREPORT_CMN_20070418_123701 +CCTVNEWS_PEOPLESCONGRESS3_CMN_20070313_085702 +CCTV1_LEGALREPORT_CMN_20070426_123701 +CCTV4_ACROSSSTRAIT_CMN_20070430_073000 +HUBEI_COMMUNICATE_CMN_20070325_013001 +CCTVNEWS_PEOPLEINNEWS_CMN_20070327_215701 +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20070315_040701 
+CCTV1_LEGALREPORT_CMN_20070416_123701 +CCTV2_PEOPLESCONGRESS1_CMN_20070315_213000 +CCTV2_ECONOMYANDLAW_CMN_20070313_105916 +CCTV1_LEGALREPORT_CMN_20070430_123701 +HUBEI_COMMUNICATE_CMN_20070415_230013 +CCTV2_ECONOMYANDLAW_CMN_20070323_202800 +CCTV1_LEGALREPORT_CMN_20070312_123702 +CCTV1_LEGALREPORT_CMN_20070210_123701 +CCTV4_ACROSSSTRAIT_CMN_20070324_073000 +CCTV4_ACROSSSTRAIT_CMN_20070321_034001 +CCTV2_ECONOMYANDLAW_CMN_20070317_202900 diff --git a/egs/gale_mandarin/s5/local/test.LDC2015S06 b/egs/gale_mandarin/s5/local/test.LDC2015S06 new file mode 100644 index 00000000000..dcdb97b1161 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2015S06 @@ -0,0 +1,14 @@ +CCTV1_LEGALREPORT_CMN_20070407_123702 +CCTV4_ACROSSSTRAIT_CMN_20070704_203000 +CCTV4_ACROSSSTRAIT_CMN_20070402_073000 +CCTV2_ECONOMYANDLAW_CMN_20070402_110000 +CCTV2_BUSINESSHOUR_CMN_20070829_220755 +CCTV1_LEGALREPORT_CMN_20070913_123702 +CCTV4_ACROSSSTRAIT_CMN_20070828_072923 +CCTV1_LEGALREPORT_CMN_20070826_123701 +CCTV4_ACROSSSTRAIT_CMN_20070715_203000 +CCTV4_ACROSSSTRAIT_CMN_20070404_202849 +CCTV2_DIALOG_CMN_20070707_090000 +CCTV1_LEGALREPORT_CMN_20070716_123701 +CCTV1_LEGALREPORT_CMN_20070408_123701 +CCTV4_ACROSSSTRAIT_CMN_20070712_203004 diff --git a/egs/gale_mandarin/s5/local/test.LDC2015S13 b/egs/gale_mandarin/s5/local/test.LDC2015S13 new file mode 100644 index 00000000000..ea52a7679af --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2015S13 @@ -0,0 +1,20 @@ +CCTV2_NEWSLIST_CMN_20070426_115000 +CCTV1_30MINNEWS_CMN_20070418_115702 +CCTV2_NEWSLIST_CMN_20070406_115000 +CCTV1_30MINNEWS_CMN_20070204_115701 +CCTVNEWS_EVENINGNEWS_CMN_20070315_225701 +CCTV1_30MINNEWS_CMN_20070417_115701 +CCTV1_30MINNEWS_CMN_20070208_115701 +CCTV4_NEWS3_CMN_20070327_075800 +CCTV7_MILITARYNEWS1_CMN_20070309_100451 +CCTV7_MILITARYNEWS1_CMN_20070310_093000 +CCTV7_MILITARYNEWS1_CMN_20070411_193000 +CCTV2_NEWSLIST_CMN_20070421_115000 +PHOENIX_PHNXWRLD_CMN_20070801_111801 +VOA_INTNLNEWS_CMN_20070927_210000 
+PHOENIX_PHNXWRLD_CMN_20070326_111800 +PHOENIX_PHNXWRLD_CMN_20070821_111801 +CCTV1_30MINNEWS_CMN_20070307_115702 +CCTVNEWS_EVENINGNEWS_CMN_20070314_225701 +VOA_CURRENTEVENTS_CMN_20070807_220000 +CCTV1_30MINNEWS_CMN_20070207_115701 diff --git a/egs/gale_mandarin/s5/local/test.LDC2016S03 b/egs/gale_mandarin/s5/local/test.LDC2016S03 new file mode 100644 index 00000000000..73245ed4c29 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2016S03 @@ -0,0 +1,20 @@ +CCTVNEWS_PEOPLEINNEWS_CMN_20080325_202401 +PHOENIX_ASIANJRNL_CMN_20080725_085800 +VOA_LISTENERSHOTLINE_CMN_20080405_223000 +CCTV1_LEGALREPORT_CMN_20080329_123802 +CCTV2_DIALOG_CMN_20080323_220801 +CCTV2_ECONOMYANDLAW_CMN_20080312_202800 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080310_090602 +VOA_LISTENERSHOTLINE_CMN_20080402_223000 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080306_083701 +CCTV2_ECONOMYANDLAW_CMN_20080428_202802 +CCTV2_ECONOMYANDLAW_CMN_20080324_202802 +VOA_FOCUSDIALOGUE_CMN_20080412_210500 +CCTV4_ACROSSSTRAIT_CMN_20080416_073002 +VOA_STRAITSTALK_CMN_20080407_210500 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080308_083701 +CCTV1_LEGALREPORT_CMN_20080406_123801 +CCTV2_DIALOG_CMN_20080427_220801 +CCTV1_LEGALREPORT_CMN_20080411_123801 +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20080306_122702 +CCTVNEWS_PEOPLEINNEWS_CMN_20080408_202701 diff --git a/egs/gale_mandarin/s5/path.sh b/egs/gale_mandarin/s5/path.sh index be11b34cbc6..e875e4b585c 100755 --- a/egs/gale_mandarin/s5/path.sh +++ b/egs/gale_mandarin/s5/path.sh @@ -1,5 +1,6 @@ export KALDI_ROOT=$(pwd)/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/kaldi_lm:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh +. 
$KALDI_ROOT/tools/env.sh export LC_ALL=C diff --git a/egs/gale_mandarin/s5/run.sh b/egs/gale_mandarin/s5/run.sh index 505ade6a269..74e69e9d12a 100755 --- a/egs/gale_mandarin/s5/run.sh +++ b/egs/gale_mandarin/s5/run.sh @@ -6,66 +6,91 @@ . ./path.sh . ./cmd.sh -nJobs=40 -nDecodeJobs=40 - -AUDIO_PATH=/export/corpora5/LDC/LDC2013S08/ -TEXT_PATH=/export/corpora5/LDC/LDC2013T20/ - +num_jobs=64 +num_jobs_decode=128 + +AUDIO=( + /export/corpora/LDC/LDC2013S08/ + /export/corpora/LDC/LDC2013S04/ + /export/corpora/LDC/LDC2014S09/ + /export/corpora/LDC/LDC2015S06/ + /export/corpora/LDC/LDC2015S13/ + /export/corpora/LDC/LDC2016S03/ +) +TEXT=( + /export/corpora/LDC/LDC2013T20/ + /export/corpora/LDC/LDC2013T08/ + /export/corpora/LDC/LDC2014T28/ + /export/corpora/LDC/LDC2015T09/ + /export/corpora/LDC/LDC2015T25/ + /export/corpora/LDC/LDC2016T12/ +) galeData=GALE/ # You can run the script from here automatically, but it is recommended to run the data preparation, # and features extraction manually and and only once. # By copying and pasting into the shell. -local/gale_data_prep_audio.sh $galeData $AUDIO_PATH - -local/gale_data_prep_txt.sh $galeData $TEXT_PATH +set -e -o pipefail +set -x -local/gale_data_prep_split.sh $galeData +local/gale_data_prep_audio.sh "${AUDIO[@]}" $galeData -local/gale_prep_dict.sh +local/gale_data_prep_txt.sh "${TEXT[@]}" $galeData -utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang +local/gale_data_prep_split.sh $galeData +local/gale_prep_dict.sh + +utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang local/gale_train_lms.sh -local/gale_format_data.sh +local/gale_format_data.sh # Now make MFCC features. # mfccdir should be some place with a largish disk where you # want to store MFCC features. mfccdir=mfcc +# spread the mfccs over various machines, as this data-set is quite large. +if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then + mfcc=$(basename mfccdir) # in case was absolute pathname (unlikely), get basename. 
+ utils/create_split_dir.pl /export/b{05,06,07,08}/$USER/kaldi-data/egs/gale_mandarin/s5/$mfcc/storage \ + $mfccdir/storage +fi + for x in train dev ; do - steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $nJobs \ + utils/fix_data_dir.sh data/$x + steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $num_jobs \ data/$x exp/make_mfcc/$x $mfccdir utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons steps/compute_cmvn_stats.sh data/$x exp/make_mfcc/$x $mfccdir done # Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; +utils/subset_data_dir.sh data/train 10000 data/train.10k || exit 1; +utils/subset_data_dir.sh data/train 50000 data/train.50k || exit 1; +utils/subset_data_dir.sh data/train 100000 data/train.100k || exit 1; # Train monophone models on a subset of the data, 10K segment # Note: the --boost-silence option should probably be omitted by default steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; + data/train.10k data/lang exp/mono || exit 1; # Get alignments from monophone system. 
-steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; +steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train.50k data/lang exp/mono exp/mono_ali.50k || exit 1; # train tri1 [first triphone pass] steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + 2500 30000 data/train.50k data/lang exp/mono_ali.50k exp/tri1 || exit 1; # First triphone decoding -utils/mkgraph.sh data/lang_dev exp/tri1 exp/tri1/graph || exit 1; -steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; +steps/decode.sh --nj $num_jobs_decode --cmd "$decode_cmd" \ exp/tri1/graph data/dev exp/tri1/decode & -steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ +steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/tri1 exp/tri1_ali || exit 1; # Train tri2a, which is deltas+delta+deltas @@ -73,111 +98,110 @@ steps/train_deltas.sh --cmd "$train_cmd" \ 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; # tri2a decoding -utils/mkgraph.sh data/lang_dev exp/tri2a exp/tri2a/graph || exit 1; -steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ +utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph || exit 1; +steps/decode.sh --nj $num_jobs_decode --cmd "$decode_cmd" \ exp/tri2a/graph data/dev exp/tri2a/decode & +steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ + data/train data/lang exp/tri2a exp/tri2a_ali || exit 1; + # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ - data/train data/lang exp/tri1_ali exp/tri2b || exit 1; -utils/mkgraph.sh data/lang_dev exp/tri2b exp/tri2b/graph || exit 1; -steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph data/dev exp/tri2b/decode & + data/train data/lang exp/tri2a_ali exp/tri2b || exit 1; +utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1; +steps/decode.sh 
--nj $num_jobs_decode --cmd "$decode_cmd" exp/tri2b/graph data/dev exp/tri2b/decode & # Align all data with LDA+MLLT system (tri2b) -steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ +steps/align_si.sh --nj $num_jobs --cmd "$train_cmd" \ --use-graphs true data/train data/lang exp/tri2b exp/tri2b_ali || exit 1; # Do MMI on top of LDA+MLLT. -steps/make_denlats.sh --nj $nJobs --cmd "$train_cmd" \ +steps/make_denlats.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/tri2b exp/tri2b_denlats || exit 1; - + steps/train_mmi.sh data/train data/lang exp/tri2b_ali \ - exp/tri2b_denlats exp/tri2b_mmi + exp/tri2b_denlats exp/tri2b_mmi -steps/decode.sh --iter 4 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ +steps/decode.sh --iter 4 --nj $num_jobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi/decode_it4 & -steps/decode.sh --iter 3 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ +steps/decode.sh --iter 3 --nj $num_jobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi/decode_it3 & # Do the same with boosting. steps/train_mmi.sh --boost 0.1 data/train data/lang exp/tri2b_ali \ -exp/tri2b_denlats exp/tri2b_mmi_b0.1 +exp/tri2b_denlats exp/tri2b_mmi_b0.1 -steps/decode.sh --iter 4 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ - data/dev exp/tri2b_mmi_b0.1/decode_it4 & -steps/decode.sh --iter 3 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ +steps/decode.sh --iter 4 --nj $num_jobs --cmd "$decode_cmd" exp/tri2b/graph \ + data/dev exp/tri2b_mmi_b0.1/decode_it4 & +steps/decode.sh --iter 3 --nj $num_jobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi_b0.1/decode_it3 & # Do MPE. 
steps/train_mpe.sh data/train data/lang exp/tri2b_ali exp/tri2b_denlats exp/tri2b_mpe || exit 1; -steps/decode.sh --iter 4 --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph \ +steps/decode.sh --iter 4 --nj $num_jobs_decode --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mpe/decode_it4 & -steps/decode.sh --iter 3 --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph \ +steps/decode.sh --iter 3 --nj $num_jobs_decode --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mpe/decode_it3 & # From 2b system, train 3b which is LDA + MLLT + SAT. steps/train_sat.sh --cmd "$train_cmd" \ 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; -utils/mkgraph.sh data/lang_dev exp/tri3b exp/tri3b/graph|| exit 1; -steps/decode_fmllr.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ +utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph|| exit 1; +steps/decode_fmllr.sh --nj $num_jobs_decode --cmd "$decode_cmd" \ exp/tri3b/graph data/dev exp/tri3b/decode & # From 3b system, align all data. 
-steps/align_fmllr.sh --nj $nJobs --cmd "$train_cmd" \ +steps/align_fmllr.sh --nj $num_jobs --cmd "$train_cmd" \ data/train data/lang exp/tri3b exp/tri3b_ali || exit 1; ## SGMM (subspace gaussian mixture model), excluding the "speaker-dependent weights" steps/train_ubm.sh --cmd "$train_cmd" 700 \ data/train data/lang exp/tri3b_ali exp/ubm5a || exit 1; - + steps/train_sgmm2.sh --cmd "$train_cmd" 5000 20000 data/train data/lang exp/tri3b_ali \ exp/ubm5a/final.ubm exp/sgmm_5a || exit 1; -utils/mkgraph.sh data/lang_dev exp/sgmm_5a exp/sgmm_5a/graph || exit 1; - -steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \ +utils/mkgraph.sh data/lang_test exp/sgmm_5a exp/sgmm_5a/graph || exit 1; +steps/decode_sgmm2.sh --nj $num_jobs_decode --cmd "$decode_cmd" --config conf/decode.config \ --transform-dir exp/tri3b/decode exp/sgmm_5a/graph data/dev exp/sgmm_5a/decode & -steps/align_sgmm2.sh --nj $nJobs --cmd "$train_cmd" --transform-dir exp/tri3b_ali \ +steps/align_sgmm2.sh --nj $num_jobs --cmd "$train_cmd" --transform-dir exp/tri3b_ali \ --use-graphs true --use-gselect true data/train data/lang exp/sgmm_5a exp/sgmm_5a_ali || exit 1; ## boosted MMI on SGMM -steps/make_denlats_sgmm2.sh --nj $nJobs --sub-split 30 --beam 9.0 --lattice-beam 6 \ - --cmd "$decode_cmd" --transform-dir \ - exp/tri3b_ali data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1; - +steps/make_denlats_sgmm2.sh --nj $num_jobs --sub-split $num_jobs --beam 9.0 --lattice-beam 6 \ + --cmd "$decode_cmd" --num-threads 4 --transform-dir exp/tri3b_ali \ + data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1; + steps/train_mmi_sgmm2.sh --cmd "$train_cmd" --num-iters 8 --transform-dir exp/tri3b_ali --boost 0.1 \ data/train data/lang exp/sgmm_5a exp/sgmm_5a_denlats exp/sgmm_5a_mmi_b0.1 - + #decode GMM MMI -utils/mkgraph.sh data/lang_dev exp/sgmm_5a_mmi_b0.1 exp/sgmm_5a_mmi_b0.1/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/sgmm_5a_mmi_b0.1 
exp/sgmm_5a_mmi_b0.1/graph || exit 1; + +steps/decode_sgmm2.sh --nj $num_jobs_decode --cmd "$decode_cmd" --config conf/decode.config \ + --transform-dir exp/tri3b/decode exp/sgmm_5a_mmi_b0.1/graph data/dev exp/sgmm_5a_mmi_b0.1/decode -steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \ - --transform-dir exp/tri3b/decode exp/sgmm_5a_mmi_b0.1/graph data/dev exp/sgmm_5a_mmi_b0.1/decode & - for n in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_dev \ + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_test \ data/dev exp/sgmm_5a_mmi_b0.1/decode exp/sgmm_5a_mmi_b0.1/decode$n - - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_dev \ - data/dev exp/sgmm_5a/decode exp/sgmm_5a_mmi_onlyRescoreb0.1/decode$n done -local/nnet/run_dnn.sh +wait +#local/nnet/run_dnn.sh -time=$(date +"%Y-%m-%d-%H-%M-%S") -#get WER -for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_* | utils/best_wer.sh; \ -done | sort -n -r -k2 > RESULTS.$USER.$time # to make sure you keep the results timed and owned +echo "# Get WER and CER" > RESULTS +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/wer_[0-9]* | utils/best_wer.sh; \ +done | sort -n -r -k2 >> RESULTS +echo "" >> RESULTS +for x in exp/*/decode*; do [ -d $x ] && grep WER $x/cer_[0-9]* | utils/best_wer.sh; \ +done | sort -n -r -k2 >> RESULTS -#get detailed WER; reports, conversational and combined -local/split_wer.sh $galeData > RESULTS.details.$USER.$time +echo -e "\n# Detailed WER on all corpus dev sets" >> RESULTS +local/split_wer_per_corpus.sh $galeData >> RESULTS echo training succedded exit 0 - - - diff --git a/egs/wsj/s5/utils/pinyin_map.pl b/egs/wsj/s5/utils/pinyin_map.pl index 65b260e2418..8210ec2af51 100755 --- a/egs/wsj/s5/utils/pinyin_map.pl +++ b/egs/wsj/s5/utils/pinyin_map.pl @@ -10,7 +10,7 @@ open(MAPS, $ARGV[0]) 
or die("Could not open pinyin map file."); my %py2ph; foreach $line () { @A = split(" ", $line); $py = shift(@A); - $py2ph{$py} = [@A]; + $py2ph{$py} = [@A]; } #foreach $word ( keys %py2ph ) { @@ -25,14 +25,14 @@ while () { @A = split(" ", $_); - @entry = (); + @entry = (); $W = shift(@A); push(@entry, $W); for($i = 0; $i < @A; $i++) { $initial= $A[$i]; $final = $A[$i]; #print $initial, " ", $final, "\n"; if ($A[$i] =~ /^CH[A-Z0-9]+$/) {$initial =~ s:(CH)[A-Z0-9]+:$1:; $final =~ s:CH([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} + elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^ZH[A-Z0-9]+$/) {$initial =~ s:(ZH)[A-Z0-9]+:$1:; $final =~ s:ZH([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^B[A-Z0-9]+$/) {$initial =~ s:(B)[A-Z0-9]+:$1:; $final =~ s:B([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^C[A-Z0-9]+$/) {$initial =~ s:(C)[A-Z0-9]+:$1:; $final =~ s:C([A-Z0-9]+):$1:;} @@ -58,22 +58,22 @@ $tone = $final; $final =~ s:([A-Z]+)[0-9]:$1:; $tone =~ s:[A-Z]+([0-9]):$1:; - if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { print "1: no entry find for ", $A[$i], " ", $initial, " ", $final; exit;} - push(@entry, @{$py2ph{$initial}}); + if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { die "$0: no entry find for ", $A[$i], " ", $initial, " ", $final;} + push(@entry, @{$py2ph{$initial}}); @tmp = @{$py2ph{$final}}; for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} - push(@entry, @tmp); + push(@entry, @tmp); } else { $tone = $A[$i]; - $A[$i] =~ s:([A-Z]+)[0-9]:$1:; + $A[$i] =~ s:([A-Z]+)[0-9]:$1:; $tone =~ s:[A-Z]+([0-9]):$1:; - if (!(exists $py2ph{$A[$i]})) { print "2: no entry find for ", $A[$i]; exit;} + if (!(exists $py2ph{$A[$i]})) { die "$0: no entry find for ", $A[$i];} @tmp = @{$py2ph{$A[$i]}}; for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} - push(@entry, @tmp); + push(@entry, @tmp); } - } + } print 
"@entry"; print "\n"; }