diff --git a/egs/gale_arabic/s5/local/split_wer_per_corpus.sh b/egs/gale_arabic/s5/local/split_wer_per_corpus.sh new file mode 100755 index 00000000000..71c8adcc3fe --- /dev/null +++ b/egs/gale_arabic/s5/local/split_wer_per_corpus.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . ./path.sh + +#set -o pipefail -e + +galeFolder=$(readlink -f $1) +symtab=./data/lang/words.txt + +min_lmwt=7 +max_lmwt=20 + +for dir in exp/*/*decode*; do + for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + #echo "Processing: $dir $type" + rm -fr $dir/scoring_$type + mkdir -p $dir/scoring_$type/log + for x in $dir/scoring/*.tra $dir/scoring/test_filt.txt; do + cat $x | grep -f local/$type > $dir/scoring_$type/$(basename $x) + done + + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/test_filt.txt ark,p:- ">&" $dir/wer_${type}_LMWT +done +done + +time=$(date +"%Y-%m-%d-%H-%M-%S") +echo "#RESULTS splits generated by $USER at $time" + +for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + echo -e "\n# WER $type" + for x in exp/*/*decode*; do + grep WER $x/wer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + + + + diff --git a/egs/gale_mandarin/s5/RESULTS b/egs/gale_mandarin/s5/RESULTS new file mode 100644 index 00000000000..47974d88975 --- /dev/null +++ b/egs/gale_mandarin/s5/RESULTS @@ -0,0 +1,97 @@ +#RESULTS splits generated by jtrmal1@jhu.edu at 2016-11-21-12-05-54 + +# WER test.LDC2013S04 +%WER 42.23 [ 40179 / 95137, 5329 ins, 8769 del, 26081 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S04_10 +%WER 43.81 [ 41682 / 95137, 5469 ins, 9213 del, 27000 sub ] exp/tri3b/decode/wer_test.LDC2013S04_13 +%WER 49.06 [ 46677 / 95137, 5459 ins, 10672 del, 30546 sub ] exp/tri2b/decode/wer_test.LDC2013S04_13 +%WER 50.53 [ 48073 / 95137, 5505 ins, 11022 del, 31546 sub ] exp/tri3b/decode.si/wer_test.LDC2013S04_12 +%WER 51.47 [ 48971 / 95137, 5103 ins, 12391 del, 31477 sub ] exp/tri2a/decode/wer_test.LDC2013S04_13 +%WER 53.30 [ 50708 / 95137, 4829 ins, 13624 del, 32255 sub ] exp/tri1/decode/wer_test.LDC2013S04_13 + +# WER test.LDC2013S08 +%WER 26.01 [ 20781 / 79911, 3764 ins, 3034 del, 13983 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S08_8 +%WER 27.43 [ 21917 / 79911, 3644 ins, 3544 del, 14729 sub ] exp/tri3b/decode/wer_test.LDC2013S08_13 +%WER 31.24 [ 24968 / 79911, 3820 ins, 3943 del, 17205 sub ] exp/tri2b/decode/wer_test.LDC2013S08_12 +%WER 32.45 [ 25932 / 79911, 3816 ins, 4112 del, 18004 sub ] exp/tri3b/decode.si/wer_test.LDC2013S08_11 +%WER 34.22 [ 27349 / 79911, 3677 ins, 5034 del, 18638 sub ] exp/tri2a/decode/wer_test.LDC2013S08_13 +%WER 35.88 [ 28676 / 79911, 3715 ins, 5127 del, 19834 sub ] exp/tri1/decode/wer_test.LDC2013S08_12 + +# WER test.LDC2014S09 +%WER 50.54 [ 39383 / 77932, 10535 ins, 7593 del, 21255 sub ] exp/sgmm_5a/decode/wer_test.LDC2014S09_12 +%WER 52.14 [ 40634 / 77932, 10271 ins, 8530 del, 21833 sub ] exp/tri3b/decode/wer_test.LDC2014S09_17 +%WER 56.57 [ 44085 / 77932, 9394 ins, 10954 del, 23737 sub ] exp/tri2b/decode/wer_test.LDC2014S09_16 +%WER 57.95 [ 45158 / 77932, 8777 ins, 12547 del, 23834 sub ] exp/tri2a/decode/wer_test.LDC2014S09_15 +%WER 58.19 [ 45347 / 77932, 9712 ins, 10831 del, 24804 sub ] exp/tri3b/decode.si/wer_test.LDC2014S09_15 +%WER 59.38 [ 46277 / 77932, 7944 ins, 14560 del, 23773 sub ] exp/tri1/decode/wer_test.LDC2014S09_16 + +# WER test.LDC2015S06 +%WER 46.22 [ 28480 / 61612, 8454 ins, 5015 del, 15011 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S06_9 +%WER 48.08 [ 29624 / 61612, 8471 ins, 5669 del, 15484 sub ] exp/tri3b/decode/wer_test.LDC2015S06_13 +%WER 52.67 [ 32450 / 61612, 8425 ins, 6441 del, 17584 sub ] exp/tri2b/decode/wer_test.LDC2015S06_12 +%WER 53.51 [ 32968 / 61612, 8444 ins, 6576 del, 17948 sub ] exp/tri3b/decode.si/wer_test.LDC2015S06_11 +%WER 55.08 [ 33936 / 61612, 8031 ins, 7811 del, 18094 sub ] exp/tri2a/decode/wer_test.LDC2015S06_13 +%WER 56.70 [ 34937 / 61612, 7890 ins, 8531 del, 18516 sub ] exp/tri1/decode/wer_test.LDC2015S06_13 + +# WER test.LDC2015S13 +%WER 23.35 [ 19752 / 84594, 2196 ins, 3274 del, 14282 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S13_9 +%WER 24.81 [ 20984 / 84594, 2214 ins, 3600 del, 15170 sub ] exp/tri3b/decode/wer_test.LDC2015S13_12 +%WER 28.62 [ 24211 / 84594, 2306 ins, 4186 del, 17719 sub ] exp/tri2b/decode/wer_test.LDC2015S13_12 +%WER 30.03 [ 25405 / 84594, 2106 ins, 4617 del, 18682 sub ] exp/tri3b/decode.si/wer_test.LDC2015S13_12 +%WER 30.58 [ 25869 / 84594, 2142 ins, 4798 del, 18929 sub ] exp/tri2a/decode/wer_test.LDC2015S13_12 +%WER 32.16 [ 27206 / 84594, 1958 ins, 5681 del, 19567 sub ] exp/tri1/decode/wer_test.LDC2015S13_13 + +# WER test.LDC2016S03 +%WER 53.04 [ 77015 / 145212, 34385 ins, 9733 del, 32897 sub ] exp/sgmm_5a/decode/wer_test.LDC2016S03_12 +%WER 54.68 [ 79399 / 145212, 34634 ins, 10414 del, 34351 sub ] exp/tri3b/decode/wer_test.LDC2016S03_17 +%WER 58.99 [ 85661 / 145212, 33946 ins, 12904 del, 38811 sub ] exp/tri2b/decode/wer_test.LDC2016S03_16 +%WER 59.80 [ 86841 / 145212, 34387 ins, 12610 del, 39844 sub ] exp/tri3b/decode.si/wer_test.LDC2016S03_15 +%WER 60.29 [ 87547 / 145212, 31358 ins, 15266 del, 40923 sub ] exp/tri2a/decode/wer_test.LDC2016S03_16 +%WER 61.75 [ 89662 / 145212, 30628 ins, 16992 del, 42042 sub ] exp/tri1/decode/wer_test.LDC2016S03_16 + +# CER test.LDC2013S04 +%WER 33.93 [ 51673 / 152279, 7241 ins, 12180 del, 32252 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S04_10 +%WER 35.31 [ 53769 / 152279, 7813 ins, 11593 del, 34363 sub ] exp/tri3b/decode/cer_test.LDC2013S04_11 +%WER 40.56 [ 61767 / 152279, 8062 ins, 13321 del, 40384 sub ] exp/tri2b/decode/cer_test.LDC2013S04_11 +%WER 42.08 [ 64081 / 152279, 8052 ins, 13940 del, 42089 sub ] exp/tri3b/decode.si/cer_test.LDC2013S04_10 +%WER 43.22 [ 65818 / 152279, 7602 ins, 15416 del, 42800 sub ] exp/tri2a/decode/cer_test.LDC2013S04_11 +%WER 44.93 [ 68413 / 152279, 7255 ins, 16855 del, 44303 sub ] exp/tri1/decode/cer_test.LDC2013S04_11 + +# CER test.LDC2013S08 +%WER 19.18 [ 25398 / 132434, 4773 ins, 3650 del, 16975 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S08_8 +%WER 20.54 [ 27201 / 132434, 4792 ins, 4037 del, 18372 sub ] exp/tri3b/decode/cer_test.LDC2013S08_11 +%WER 24.12 [ 31943 / 132434, 4817 ins, 4968 del, 22158 sub ] exp/tri2b/decode/cer_test.LDC2013S08_12 +%WER 25.15 [ 33309 / 132434, 4839 ins, 5019 del, 23451 sub ] exp/tri3b/decode.si/cer_test.LDC2013S08_11 +%WER 26.90 [ 35623 / 132434, 4725 ins, 6057 del, 24841 sub ] exp/tri2a/decode/cer_test.LDC2013S08_12 +%WER 28.45 [ 37674 / 132434, 4506 ins, 6690 del, 26478 sub ] exp/tri1/decode/cer_test.LDC2013S08_12 + +# CER test.LDC2014S09 +%WER 42.24 [ 53240 / 126027, 16007 ins, 10270 del, 26963 sub ] exp/sgmm_5a/decode/cer_test.LDC2014S09_11 +%WER 43.81 [ 55212 / 126027, 15435 ins, 11971 del, 27806 sub ] exp/tri3b/decode/cer_test.LDC2014S09_15 +%WER 48.72 [ 61395 / 126027, 14667 ins, 15066 del, 31662 sub ] exp/tri2b/decode/cer_test.LDC2014S09_14 +%WER 50.20 [ 63270 / 126027, 15105 ins, 14701 del, 33464 sub ] exp/tri3b/decode.si/cer_test.LDC2014S09_13 +%WER 50.37 [ 63481 / 126027, 13343 ins, 18289 del, 31849 sub ] exp/tri2a/decode/cer_test.LDC2014S09_14 +%WER 51.95 [ 65470 / 126027, 12613 ins, 20231 del, 32626 sub ] exp/tri1/decode/cer_test.LDC2014S09_14 + +# CER test.LDC2015S06 +%WER 38.57 [ 38234 / 99132, 12510 ins, 7120 del, 18604 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S06_9 +%WER 40.30 [ 39954 / 99132, 12593 ins, 7986 del, 19375 sub ] exp/tri3b/decode/cer_test.LDC2015S06_12 +%WER 44.83 [ 44438 / 99132, 12639 ins, 8903 del, 22896 sub ] exp/tri2b/decode/cer_test.LDC2015S06_11 +%WER 45.71 [ 45318 / 99132, 12631 ins, 9164 del, 23523 sub ] exp/tri3b/decode.si/cer_test.LDC2015S06_10 +%WER 47.39 [ 46983 / 99132, 12432 ins, 9935 del, 24616 sub ] exp/tri2a/decode/cer_test.LDC2015S06_11 +%WER 49.03 [ 48600 / 99132, 12250 ins, 10831 del, 25519 sub ] exp/tri1/decode/cer_test.LDC2015S06_11 + +# CER test.LDC2015S13 +%WER 17.05 [ 23993 / 140702, 2450 ins, 3594 del, 17949 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S13_8 +%WER 18.39 [ 25872 / 140702, 2257 ins, 4274 del, 19341 sub ] exp/tri3b/decode/cer_test.LDC2015S13_11 +%WER 21.98 [ 30933 / 140702, 2347 ins, 4784 del, 23802 sub ] exp/tri2b/decode/cer_test.LDC2015S13_11 +%WER 23.23 [ 32679 / 140702, 2197 ins, 5383 del, 25099 sub ] exp/tri3b/decode.si/cer_test.LDC2015S13_11 +%WER 23.88 [ 33596 / 140702, 2030 ins, 6225 del, 25341 sub ] exp/tri2a/decode/cer_test.LDC2015S13_12 +%WER 25.47 [ 35842 / 140702, 1944 ins, 6979 del, 26919 sub ] exp/tri1/decode/cer_test.LDC2015S13_12 + +# CER test.LDC2016S03 +%WER 45.40 [ 106787 / 235216, 53964 ins, 12519 del, 40304 sub ] exp/sgmm_5a/decode/cer_test.LDC2016S03_11 +%WER 46.75 [ 109953 / 235216, 54007 ins, 13639 del, 42307 sub ] exp/tri3b/decode/cer_test.LDC2016S03_15 +%WER 51.08 [ 120139 / 235216, 53593 ins, 16514 del, 50032 sub ] exp/tri2b/decode/cer_test.LDC2016S03_14 +%WER 51.97 [ 122235 / 235216, 52763 ins, 17940 del, 51532 sub ] exp/tri3b/decode.si/cer_test.LDC2016S03_15 +%WER 52.61 [ 123739 / 235216, 47836 ins, 22637 del, 53266 sub ] exp/tri2a/decode/cer_test.LDC2016S03_16 +%WER 54.06 [ 127163 / 235216, 47776 ins, 23865 del, 55522 sub ] exp/tri1/decode/cer_test.LDC2016S03_15 diff --git a/egs/gale_mandarin/s5/conf/decode.config b/egs/gale_mandarin/s5/conf/decode.config new file mode 100644 index 00000000000..e69de29bb2d diff --git a/egs/gale_mandarin/s5/local/bad_utts b/egs/gale_mandarin/s5/local/bad_utts new file mode 100644 index 00000000000..6683c9a97a5 --- /dev/null +++ b/egs/gale_mandarin/s5/local/bad_utts @@ -0,0 +1,12 @@ +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20070308_040701 +CCTV2_ECONOMYANDLAW_CMN_20070426_202800 +CCTV2_ECONOMYANDLAW_CMN_20070426_202800(1) +CCTV2_LIANGHUI_PROBLEM_20070308_213000 +CCTV4_TDYFOCUS_CMN_20070824_092801 +VOA_ISSUESANDOPINIONS_CMN_20070801_210500 +VOA_ISSUESANDOPINIONS_CMN_20070926_210500 +VOA_LISTENERSHOTLINE_CMN_20070906_223000 +VOA_LISTENERSHOTLINE_CMN_20070926_223000 +VOA_LISTENERSHOTLINE_CMN_20070927_223000 +PHOENIX_NEWSLINE_CMN_20070101_114800 +PHOENIX_NEWSLINE_CMN_20070101_114800(1) diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh b/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh index c2d61cfb864..598c6b37c17 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_audio.sh @@ -1,46 +1,69 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014 QCRI (author: Ahmed Ali) +# Copyright 2016 Johns Hopkins Univeersity (author: Jan "Yenda" Trmal) # Apache 2.0 -if [ $# -ne 2 ]; then - echo "Arguments should be the "; exit 1 -fi +echo $0 "$@" + +galeData=$(readlink -f "${@: -1}" ); +wavedir=$galeData/wav +mkdir -p $wavedir + -# check that sox is installed +length=$(($#-1)) +args=${@:1:$length} +# check that sox is installed which sox &>/dev/null -if [[ $? != 0 ]]; then - echo "sox is not installed" - exit 1 +if [[ $? != 0 ]]; then + echo "$0: sox is not installed" + exit 1 fi -galeData=$1 -wavedir=$galeData/wav -mkdir -p $wavedir +set -e -o pipefail + +for var in $args; do + CD=$(basename $var) + [ -d $wavedir/$CD ] && rm -rf $wavedir/$CD + mkdir -p $wavedir/$CD + find $var -type f -name *.wav | while read file; do + f=$(basename $file) + if [[ ! -L "$wavedir/$CD/$f" ]]; then + ln -sf $file $wavedir/$CD/$f + fi + done -audio_path=$2 - -mkdir -p $wavedir/ - -#copy and convert the flac to wav -find $audio_path -type f -name *.flac | while read file; do - f_name=$(basename $file) - if [[ ! -e $wavedir/"${f_name%.flac}.wav" ]]; then - echo "soxing $file to $wavedir/$CD/"${f_name%.flac}.wav" " - sox $file $wavedir/"${f_name%.flac}.wav" - fi - + #make an flac symmlink as well + find $var -type f -name *.flac | while read file; do + f=$(basename $file) + + if [[ ! -L "$wavedir/$CD/$f" ]]; then + ln -sf $file $wavedir/$CD/$f + fi + done done -find $wavedir -name *.wav > $galeData/wav$$ -awk -F "/" '{print $NF}' $galeData/wav$$ | sed 's:\.wav::' > $galeData/id$$ -paste -d ' ' $galeData/id$$ $galeData/wav$$ | sort -u > $galeData/wav.scp +#figure out the proper sox command line +#the flac will be converted on the fly +( + for w in `find $wavedir -name *.wav` ; do + base=`basename $w .wav` + fullpath=`readlink -f $w` + echo "$base sox $fullpath -r 16000 -t wav - |" + done + + for w in `find $wavedir -name *.flac` ; do + base=`basename $w .flac` + fullpath=`readlink -f $w` + echo "$base sox $fullpath -r 16000 -t wav - |" + done +) | sort -u > $galeData/wav.scp -#clean +#clean rm -fr $galeData/id$$ $galeData/wav$$ -echo data prep audio succeded +echo "$0: data prep audio succeded" exit 0 diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_split.sh b/egs/gale_mandarin/s5/local/gale_data_prep_split.sh index 63b6d8d2f7b..40c29415a1e 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_split.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_split.sh @@ -1,37 +1,33 @@ -#!/bin/bash +#!/bin/bash # Copyright 2014 (author: Ahmed Ali, Hainan Xu) +# Copyright 2016 Johns Hopkins Univeersity (author: Jan "Yenda" Trmal) # Apache 2.0 if [ $# -ne 1 ]; then echo "Arguments should be the "; exit 1 fi +set -e -o pipefail #data will data/local galeData=$(readlink -f $1) mkdir -p data/local dir=$(readlink -f data/local) -cat $galeData/utt2spk | awk '{print$2}' | sort -u > $galeData/spklist - -cat $galeData/spklist | utils/shuffle_list.pl --srand ${seed:-777} > $galeData/spklist.shuffled - -# we want about 6h dev data; 300 is manually chosen -cat $galeData/spklist.shuffled | head -n 300 > $galeData/spklist.dev - - -cat $galeData/utt2spk | grep -f $galeData/spklist.dev | awk '{print$1}' > $galeData/dev.list # some problem with the text data; same utt id but different transcription -cat $galeData/all | awk '{print$2}' | sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list +cat $galeData/all | awk '{print$2}' | \ + sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list -utils/filter_scp.pl --exclude -f 2 $galeData/dup.list $galeData/all > $galeData/all_nodup +utils/filter_scp.pl --exclude -f 2 \ + $galeData/dup.list $galeData/all > $galeData/all.nodup -mv $galeData/all_nodup $galeData/all +mv $galeData/all $galeData/all.orig +mv $galeData/all.nodup $galeData/all -utils/filter_scp.pl -f 2 $galeData/dev.list $galeData/all > $galeData/all.dev -utils/filter_scp.pl --exclude -f 2 $galeData/dev.list $galeData/all > $galeData/all.train +grep -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.dev +grep -v -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.train cat $galeData/all.dev | awk '{print$2}' > $galeData/dev_utt_list cat $galeData/all.train | awk '{print$2}' > $galeData/train_utt_list @@ -46,11 +42,11 @@ utils/utt2spk_to_spk2utt.pl $dir/train/utt2spk | sort -u > $dir/train/spk2utt for x in dev train; do outdir=$dir/$x - file=$galeData/all.$x + file=$galeData/all.$x mkdir -p $outdir awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text -done +done cat $dir/dev/segments | awk '{print$2}' | sort -u > $galeData/dev.wav.list cat $dir/train/segments | awk '{print$2}' | sort -u > $galeData/train.wav.list @@ -60,5 +56,6 @@ utils/filter_scp.pl -f 1 $galeData/train.wav.list $galeData/wav.scp > $dir/train cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline0) {seen[$2]=1;}} {if (seen[$1]) { print $0}}' > $dir/train/wav.scp - + + echo data prep split succeeded diff --git a/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh b/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh index 3fe32055f6c..7e3e57c92a8 100755 --- a/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh +++ b/egs/gale_mandarin/s5/local/gale_data_prep_txt.sh @@ -1,28 +1,38 @@ #!/bin/bash # Copyright 2014 (author: Ahmed Ali, Hainan Xu) +# Copyright 2016 Johns Hopkins Univeersity (author: Jan "Yenda" Trmal) # Apache 2.0 -if [ $# -ne 2 ]; then - echo "Arguments should be the "; exit 1 -fi - +echo $0 "$@" export LC_ALL=C -galeData=$1 -text=$2 +galeData=$(readlink -f "${@: -1}" ); -cur=`pwd` +length=$(($#-1)) +args=${@:1:$length} -txtdir=$galeData/txt -mkdir -p $galeData/txt +top_pwd=`pwd` +txtdir=$galeData/txt +mkdir -p $txtdir cd $txtdir -find $text -type f -name *.tdf | while read file; do -sed '1,3d' $file -done > all.tmp +for cdx in ${args[@]}; do + echo "Preparing $cdx" + if [[ $cdx == *.tgz ]] ; then + tar -xvf $cdx + elif [ -d "$cdx" ]; then + tgt=$(basename $cdx) + test -x $tgt || ln -s $cdx `basename $tgt` + else + echo "I don't really know what I shall do with $cdx " >&2 + fi +done +find -L . -type f -name *.tdf | while read file; do +sed '1,3d' $file +done > all.tmp perl -e ' ($inFile,$idFile,$txtFile,$spk,$mapf)= split /\s+/, $ARGV[0]; @@ -34,22 +44,35 @@ perl -e ' while () { @arr= split /\t/,$_; $arr[4] =~ s/ //g; + $arr[4] = sprintf("%020s", $arr[4]); $spkid = "$arr[0]_$arr[4]"; - $spkfix = sprintf("%060s", $spkid); - $start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning - $end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//; - $id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n"; - next if ($rStart == $rEnd); - $id =~ s/.sph//g; - print ID $id; + $spkfix = sprintf("%080s", $spkid); + + $start=sprintf ("%0.3f",$arr[2]); + $rStart=$start; + $start=~s/\.//; + $start=~s/^0+$/0/; + $start=~s/^0+([^0])/$1/; # remove zeros at the beginning + $start = sprintf("%09s", $start); + + $end=sprintf ("%0.3f",$arr[3]); + $rEnd=$end; + $end=~s/^0+([^0])/$1/; + $end=~s/\.//; + $end = sprintf("%09s", $end); + + $id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n"; + next if ($rStart == $rEnd); + $id =~ s/.sph//g; + print ID $id; print TXT "$arr[7]\n"; print SPK "${spkfix}_$arr[0]_${start}_${end} ${spkfix}\n"; print MAP "$arr[0] ${spkfix}_$arr[0]\n"; - }' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp" + }' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp" perl -p -i -e 's=/.$==g' contentall.tmp -cd $cur +cd $top_pwd pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` @@ -57,11 +80,11 @@ export PYTHONPATH=$PYTHONPATH:`pwd`/tools/mmseg-1.3.0/lib/python${pyver}/site-pa if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then echo "--- Downloading mmseg-1.3.0 ..." echo "NOTE: it assumes that you have Python, Setuptools installed on your system!" - wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz + wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz tar xf tools/mmseg-1.3.0.tar.gz -C tools cd tools/mmseg-1.3.0 mkdir -p lib/python${pyver}/site-packages - python setup.py build + CC=gcc CXX=g++ python setup.py build python setup.py install --prefix=. cd ../.. if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then @@ -90,11 +113,8 @@ awk '{$1="";print $0}' $txtdir/all_1.tmp | sed 's:^ ::' > $txtdir/../all cat $txtdir/utt2spk.tmp | sort -u > $txtdir/../utt2spk cat $txtdir/map.tmp | sort -u > $txtdir/../map -sort -c $txtdir/../utt2spk +sort -c $txtdir/../utt2spk utils/utt2spk_to_spk2utt.pl $txtdir/../utt2spk | sort -u > $txtdir/../spk2utt -cd ..; -rm -fr $txtdir - echo data prep text succeeded diff --git a/egs/gale_mandarin/s5/local/gale_format_data.sh b/egs/gale_mandarin/s5/local/gale_format_data.sh index 71187e89a12..204fa31fd42 100755 --- a/egs/gale_mandarin/s5/local/gale_format_data.sh +++ b/egs/gale_mandarin/s5/local/gale_format_data.sh @@ -8,19 +8,20 @@ if [ -f path.sh ]; then echo "missing path.sh"; exit 1; fi +set -e -o pipefail +set -x + for dir in dev train; do - cp -pr data/local/$dir data/$dir + cp -prT data/local/$dir data/$dir done export LC_ALL=C -mkdir -p data/lang_dev - arpa_lm=data/local/lm/3gram-mincount/lm_unpruned.gz [ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1; -rm -r data/lang_dev -cp -r data/lang data/lang_dev +rm -r data/lang_test || true +cp -r data/lang data/lang_test gunzip -c "$arpa_lm" | \ arpa2fst --disambig-symbol=#0 \ @@ -28,31 +29,35 @@ gunzip -c "$arpa_lm" | \ echo "Checking how stochastic G is (the first of these numbers should be small):" -fstisstochastic data/lang_dev/G.fst +fstisstochastic data/lang_test/G.fst || true ## Check lexicon. ## just have a look and make sure it seems sane. echo "First few lines of lexicon FST:" -fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head - +( + fstprint --isymbols=data/lang/phones.txt --osymbols=data/lang/words.txt data/lang/L.fst | head +) || true echo Performing further checks # Checking that G.fst is determinizable. -fstdeterminize data/lang_dev/G.fst /dev/null || echo Error determinizing G. +fstdeterminize data/lang_test/G.fst /dev/null || { + echo Error determinizing G. + exit 1 +} # Checking that L_disambig.fst is determinizable. -fstdeterminize data/lang_dev/L_disambig.fst /dev/null || echo Error determinizing L. +fstdeterminize data/lang_test/L_disambig.fst /dev/null || echo Error determinizing L. # Checking that disambiguated lexicon times G is determinizable # Note: we do this with fstdeterminizestar not fstdeterminize, as # fstdeterminize was taking forever (presumbaly relates to a bug # in this version of OpenFst that makes determinization slow for # some case). -fsttablecompose data/lang_dev/L_disambig.fst data/lang_dev/G.fst | \ +fsttablecompose data/lang_test/L_disambig.fst data/lang_test/G.fst | \ fstdeterminizestar >/dev/null || echo Error # Checking that LG is stochastic: -fsttablecompose data/lang/L_disambig.fst data/lang_dev/G.fst | \ +fsttablecompose data/lang/L_disambig.fst data/lang_test/G.fst | \ fstisstochastic || echo LG is not stochastic diff --git a/egs/gale_mandarin/s5/local/gale_prep_dict.sh b/egs/gale_mandarin/s5/local/gale_prep_dict.sh index cd3ed602c70..cb3f1b56cba 100755 --- a/egs/gale_mandarin/s5/local/gale_prep_dict.sh +++ b/egs/gale_mandarin/s5/local/gale_prep_dict.sh @@ -1,13 +1,14 @@ #!/bin/bash # prepare dictionary for HKUST -# it is done for English and Chinese separately, +# it is done for English and Chinese separately, # For English, we use CMU dictionary, and Sequitur G2P # for OOVs, while all englist phone set will concert to Chinese # phone set at the end. For Chinese, we use an online dictionary, # for OOV, we just produce pronunciation using Charactrt Mapping. - -. path.sh +. ./path.sh + +set -e -o pipefail [ $# != 0 ] && echo "Usage: local/hkust_prepare_dict.sh" && exit 1; train_dir=data/local/train @@ -23,18 +24,29 @@ esac # extract full vocabulary cat $train_dir/text $dev_dir/text | awk '{for (i = 2; i <= NF; i++) print $i}' |\ - sed -e 's/ /\n/g' | sort -u | grep -v '\[LAUGHTER\]' | grep -v '\[NOISE\]' |\ - grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt + sed -e 's/ /\n/g' | sort -u | \ + grep -v '\[LAUGHTER\]' | \ + grep -v '\[NOISE\]' |\ + grep -v '\[VOCALIZED-NOISE\]' > $dict_dir/vocab-full.txt # split into English and Chinese cat $dict_dir/vocab-full.txt | grep '[a-zA-Z]' > $dict_dir/vocab-en.txt -cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' > $dict_dir/vocab-ch.txt +cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' | \ + perl -CSD -Mutf8 -ane '{print if /^\p{InCJK_Unified_Ideographs}+$/;}' > $dict_dir/vocab-ch.txt +cat $dict_dir/vocab-full.txt | grep -v '[a-zA-Z]' | \ + perl -CSD -Mutf8 -ane '{print unless /^\p{InCJK_Unified_Ideographs}+$/;}' > $dict_dir/vocab-weird.txt + -# produce pronunciations for english +# produce pronunciations for english if [ ! -f $dict_dir/cmudict/cmudict.0.7a ]; then echo "--- Downloading CMU dictionary ..." - svn co https://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict \ - $dict_dir/cmudict || exit 1; + svn co http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/ $dict_dir/cmudict || \ + wget -e robots=off -r -np -nH --cut-dirs=4 -R index.html http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/ -P $dict_dir || exit 1 +fi + +if [ ! -f $dict_dir/cmudict/scripts/make_baseform.pl ] ; then + echo "$0: $dict_dir/cmudict/scripts/make_baseform.pl does not exist!"; + exit fi echo "--- Striping stress and pronunciation variant markers from cmudict ..." @@ -54,23 +66,6 @@ gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ wc -l $dict_dir/vocab-en-oov.txt wc -l $dict_dir/lexicon-en-iv.txt -pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'` -export PYTHONPATH=$PYTHONPATH:`pwd`/tools/g2p/lib/python${pyver}/site-packages -if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then - echo "--- Downloading Sequitur G2P ..." - echo "NOTE: it assumes that you have Python, NumPy and SWIG installed on your system!" - wget -P tools http://www-i6.informatik.rwth-aachen.de/web/Software/g2p-r1668.tar.gz - tar xf tools/g2p-r1668.tar.gz -C tools - cd tools/g2p - echo '#include ' >> Utility.hh # won't compile on my system w/o this "patch" - python setup.py build - python setup.py install --prefix=. - cd ../.. - if [ ! -f tools/g2p/lib/python${pyver}/site-packages/g2p.py ]; then - echo "Sequitur G2P is not found - installation failed?" - exit 1 - fi -fi if [ ! -f conf/g2p_model ]; then echo "--- Downloading a pre-trained Sequitur G2P model ..." @@ -82,8 +77,7 @@ if [ ! -f conf/g2p_model ]; then fi echo "--- Preparing pronunciations for OOV words ..." -python tools/g2p/lib/python${pyver}/site-packages/g2p.py \ - --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt +g2p.py --model=conf/g2p_model --apply $dict_dir/vocab-en-oov.txt > $dict_dir/lexicon-en-oov.txt cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ sort > $dict_dir/lexicon-en-phn.txt @@ -91,25 +85,25 @@ cat $dict_dir/lexicon-en-oov.txt $dict_dir/lexicon-en-iv.txt |\ -# produce pronunciations for chinese +# produce pronunciations for chinese if [ ! -f $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt ]; then - wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz + wget -P $dict_dir http://www.mdbg.net/chindict/export/cedict/cedict_1_0_ts_utf-8_mdbg.txt.gz gunzip $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt.gz fi cat $dict_dir/cedict_1_0_ts_utf-8_mdbg.txt | grep -v '#' | awk -F '/' '{print $1}' |\ - perl -e ' + perl -e ' while () { @A = split(" ", $_); print $A[1]; for($n = 2; $n < @A; $n++) { - $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; - $tmp = uc($A[$n]); + $A[$n] =~ s:\[?([a-zA-Z0-9\:]+)\]?:$1:; + $tmp = uc($A[$n]); print " $tmp"; } print "\n"; } - ' | sort -k1 > $dict_dir/ch-dict.txt + ' | sort -k1 > $dict_dir/ch-dict.txt echo "--- Searching for Chinese OOV words ..." gawk 'NR==FNR{words[$1]; next;} !($1 in words)' \ @@ -120,22 +114,22 @@ gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ $dict_dir/vocab-ch.txt $dict_dir/ch-dict.txt |\ egrep -v '<.?s>' > $dict_dir/lexicon-ch-iv.txt -wc -l $dict_dir/vocab-ch-oov.txt -wc -l $dict_dir/lexicon-ch-iv.txt +wc -l $dict_dir/vocab-ch-oov.txt || true +wc -l $dict_dir/lexicon-ch-iv.txt || true # this unset LC_ALL -# first make sure number of characters and pinyins -# are equal +# first make sure number of characters and pinyins +# are equal cat $dict_dir/ch-dict.txt |\ perl -e ' use encoding utf8; while () { @A = split(" ", $_); $word_len = length($A[0]); - $proun_len = @A - 1 ; + $proun_len = @A - 1 ; if ($word_len == $proun_len) {print $_;} } ' > $dict_dir/ch-dict-1.txt @@ -144,11 +138,12 @@ cat $dict_dir/ch-dict-1.txt | awk '{print $1}' | sed -e 's/\(\S\)/\1\n/g' | grep cat $dict_dir/ch-dict-1.txt | awk '{for(i=2; i<=NF; i++) print $i}' | sed -e 's/ /\n/g' > $dict_dir/ch-char-pinyin.txt wc -l $dict_dir/ch-char.txt wc -l $dict_dir/ch-char-pinyin.txt -paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt +paste $dict_dir/ch-char.txt $dict_dir/ch-char-pinyin.txt | sort -u > $dict_dir/ch-char-dict.txt + cat $dict_dir/ch-char-dict.txt |\ perl -e ' - my $prev = ""; + my $prev = ""; my $out_line = ""; while () { @A = split(" ", $_); @@ -157,16 +152,16 @@ cat $dict_dir/ch-char-dict.txt |\ #print length($prev); if (length($prev) == 0) { $out_line = $_; chomp($out_line);} if (length($prev)>0 && $cur ne $prev) { print $out_line; print "\n"; $out_line = $_; chomp($out_line);} - if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} + if (length($prev)>0 && $cur eq $prev) { $out_line = $out_line."/"."$cur_py";} $prev = $cur; } - print $out_line; - ' > $dict_dir/ch-char-dict-1.txt + print $out_line; + ' > $dict_dir/ch-char-dict-1.txt cat $dict_dir/vocab-ch-oov.txt | awk -v w=$dict_dir/ch-char-dict-1.txt \ - 'BEGIN{while((getline0) dict[$1]=$2;} + 'BEGIN{while((getline0) dict[$1]=$2;} {printf("%s", $1); for (i=1; i<=length($1); i++) { py=substr($1, i, 1); printf(" %s", dict[py]); } printf("\n"); }' \ - > $dict_dir/lexicon-ch-oov.txt + > $dict_dir/lexicon-ch-oov.txt cat $dict_dir/lexicon-ch-oov.txt |\ perl -e ' @@ -175,8 +170,8 @@ cat $dict_dir/lexicon-ch-oov.txt |\ while () { @A = split(" ", $_); @entry = (); - push(@entry, $A[0]); - for($i = 1; $i < @A; $i++ ) { + push(@entry, $A[0]); + for($i = 1; $i < @A; $i++ ) { @py = split("/", $A[$i]); @entry1 = @entry; @entry = (); @@ -184,29 +179,29 @@ cat $dict_dir/lexicon-ch-oov.txt |\ for ($k = 0; $k < @py; $k++) { $tmp = $entry1[$j]." ".$py[$k]; push(@entry, $tmp); - } - } + } + } } for ($i = 0; $i < @entry; $i++) { - print $entry[$i]; + print $entry[$i]; print "\n"; - } + } } ' > $dict_dir/lexicon-ch-oov1.txt cat $dict_dir/lexicon-ch-oov1.txt $dict_dir/lexicon-ch-iv.txt |\ - awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch.txt + awk '{if (NF > 1) print $0;}' > $dict_dir/lexicon-ch.txt cat $dict_dir/lexicon-ch.txt | sed -e 's/U:/V/g' | sed -e 's/ R\([0-9]\)/ ER\1/g'|\ utils/pinyin_map.pl conf/pinyin2cmu > $dict_dir/lexicon-ch-cmu.txt -cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/cmu +cat conf/cmu2pinyin | awk '{print $1;}' | sort -u > $dict_dir/cmu cat conf/pinyin2cmu | awk -v cmu=$dict_dir/cmu \ 'BEGIN{while((getline $dict_dir/cmu-used cat $dict_dir/cmu | awk -v cmu=$dict_dir/cmu-used \ 'BEGIN{while((getline $dict_dir/cmu-not-used + {if (!dict[$1]) print $1;}' > $dict_dir/cmu-not-used gawk 'NR==FNR{words[$1]; next;} ($1 in words)' \ $dict_dir/cmu-not-used conf/cmu2pinyin |\ @@ -229,9 +224,9 @@ cat $dict_dir/cmu-py | \ push(@entry, $W); for($i = 0; $i < @A; $i++) { push(@entry, @{$py2ph{$A[$i]}}); } print "@entry"; - print "\n"; - } -' conf/pinyin2cmu > $dict_dir/cmu-cmu + print "\n"; + } +' conf/pinyin2cmu > $dict_dir/cmu-cmu cat $dict_dir/lexicon-en-phn.txt | \ perl -e ' @@ -248,14 +243,14 @@ cat $dict_dir/lexicon-en-phn.txt | \ @entry = (); $W = shift(@A); push(@entry, $W); - for($i = 0; $i < @A; $i++) { + for($i = 0; $i < @A; $i++) { if (exists $py2ph{$A[$i]}) { push(@entry, @{$py2ph{$A[$i]}}); } else {push(@entry, $A[$i])}; } print "@entry"; - print "\n"; + print "\n"; } -' $dict_dir/cmu-cmu > $dict_dir/lexicon-en.txt +' $dict_dir/cmu-cmu > $dict_dir/lexicon-en.txt cat $dict_dir/lexicon-en.txt $dict_dir/lexicon-ch-cmu.txt |\ sort -u > $dict_dir/lexicon1.txt @@ -267,8 +262,8 @@ cat $dict_dir/lexicon1.txt | awk '{ for(n=2;n<=NF;n++){ phones[$n] = 1; }} END{f while () { $phone = $_; chomp($phone); - chomp($_); - $phone =~ s:([A-Z]+)[0-9]:$1:; + chomp($_); + $phone =~ s:([A-Z]+)[0-9]:$1:; if (exists $ph_cl{$phone}) { push(@{$ph_cl{$phone}}, $_) } else { $ph_cl{$phone} = [$_]; } } @@ -298,7 +293,5 @@ cat $dict_dir/nonsilence_phones.txt | perl -e 'while(<>){ foreach $p (split(" ", export LC_ALL=C +echo "$0: Done" - - -exit 1; diff --git a/egs/gale_mandarin/s5/local/gale_train_lms.sh b/egs/gale_mandarin/s5/local/gale_train_lms.sh index af429ae2af1..b70bf8de564 100755 --- a/egs/gale_mandarin/s5/local/gale_train_lms.sh +++ b/egs/gale_mandarin/s5/local/gale_train_lms.sh @@ -4,13 +4,13 @@ # To be run from one directory above this script. -lexicon=data/local/dict/lexicon.txt +lexicon=data/local/dict/lexicon.txt [ ! -f $lexicon ] && echo "$0: No such file $lexicon" && exit 1; # check if sri is installed or no sri_installed=false which ngram-count &>/dev/null -if [[ $? == 0 ]]; then +if [[ $? == 0 ]]; then sri_installed=true fi @@ -23,9 +23,9 @@ fi export LC_ALL=C # You'll get errors about things being not sorted, if you # have a different locale. -export PATH=$PATH:./../../../tools/kaldi_lm +export PATH=$PATH:$KALDI_ROOT/tools/kaldi_lm ( # First make sure the kaldi_lm toolkit is installed. - cd ../../../tools || exit 1; + cd $KALDI_ROOT/tools || exit 1; if [ -d kaldi_lm ]; then echo Not installing the kaldi_lm toolkit since it is already there. else @@ -45,10 +45,10 @@ dir=data/local/lm mkdir -p $dir text=data/local/train/text [ ! -f $text ] && echo "$0: No such file $text" && exit 1; - + cleantext=$dir/text.no_oov - cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } + cat $text | awk -v lex=$lexicon 'BEGIN{while((getline0){ seen[$1]=1; } } {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf(" ",$n);} } printf("\n");}' \ > $cleantext || exit 1; @@ -72,20 +72,20 @@ dir=data/local/lm cat $cleantext | awk -v wmap=$dir/word_map 'BEGIN{while((getline0)map[$1]=$2;} { for(n=2;n<=NF;n++) { printf map[$n]; if(n$dir/train.gz \ || exit 1; - + train_lm.sh --arpa --lmtype 3gram-mincount $dir || exit 1; # LM is small enough that we don't need to prune it (only about 0.7M N-grams). # Perplexity over 128254.000000 words is 90.446690 # note: output is -# data/local/lm/3gram-mincount/lm_unpruned.gz +# data/local/lm/3gram-mincount/lm_unpruned.gz # From here is some commands to do a baseline with SRILM (assuming # you have it installed). -if $sri_installed; then +if $sri_installed; then heldout_sent=10000 # Don't change this if you want result to be comparable with # kaldi_lm results @@ -101,14 +101,14 @@ if $sri_installed; then ngram-count -text $sdir/train -order 3 -limit-vocab -vocab $sdir/wordlist -unk \ -map-unk "" -kndiscount -interpolate -lm $sdir/srilm.o3g.kn.gz - ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout + ngram -lm $sdir/srilm.o3g.kn.gz -ppl $sdir/heldout # 0 zeroprobs, logprob= -250954 ppl= 90.5091 ppl1= 132.482 # Note: perplexity SRILM gives to Kaldi-LM model is same as kaldi-lm reports above. # Difference in WSJ must have been due to different treatment of . - ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout + ngram -lm $dir/3gram-mincount/lm_unpruned.gz -ppl $sdir/heldout # 0 zeroprobs, logprob= -250913 ppl= 90.4439 ppl1= 132.379 fi -echo train lm succeeded \ No newline at end of file +echo train lm succeeded diff --git a/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh b/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh new file mode 100755 index 00000000000..b4a4de94a6d --- /dev/null +++ b/egs/gale_mandarin/s5/local/split_wer_per_corpus.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Report WER for reports and conversational +# Copyright 2014 QCRI (author: Ahmed Ali) +# Apache 2.0 + +if [ $# -ne 1 ]; then + echo "Arguments should be the gale folder, see ../run.sh for example." + exit 1; +fi + +[ -f ./path.sh ] && . ./path.sh + +set -o pipefail -e + +galeFolder=$(readlink -f $1) +symtab=./data/lang/words.txt + +min_lmwt=7 +max_lmwt=20 + +for dir in exp/*/*decode*; do + for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + #echo "Processing: $dir $type" + rm -fr $dir/scoring_$type + mkdir -p $dir/scoring_$type/log + for x in $dir/scoring/*.char $dir/scoring/*.tra $dir/scoring/char.filt $dir/scoring/text.filt; do + cat $x | grep -f local/$type > $dir/scoring_$type/$(basename $x) + done + + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.LMWT.log \ + cat $dir/scoring_${type}/LMWT.tra \| \ + utils/int2sym.pl -f 2- $symtab \| sed 's:\::g' \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/text.filt ark,p:- ">&" $dir/wer_${type}_LMWT + utils/run.pl LMWT=$min_lmwt:$max_lmwt $dir/scoring_$type/log/score.cer.LMWT.log \ + cat $dir/scoring_${type}/LMWT.char \| \ + compute-wer --text --mode=present \ + ark:$dir/scoring_${type}/char.filt ark,p:- ">&" $dir/cer_${type}_LMWT +done +done + +time=$(date +"%Y-%m-%d-%H-%M-%S") +echo "#RESULTS splits generated by $USER at $time" + +for type in $(ls -1 local/test_list local/test.* | xargs -n1 basename); do + echo -e "\n# WER $type" + for x in exp/*/*decode*; do + grep WER $x/wer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + +for type in $(ls -1 local/test.* | xargs -n1 basename); do + echo -e "\n# CER $type" + for x in exp/*/*decode*; do + grep WER $x/cer_${type}_* | utils/best_wer.sh; + done | sort -n -k2 +done + + + diff --git a/egs/gale_mandarin/s5/local/test.LDC2013S04 b/egs/gale_mandarin/s5/local/test.LDC2013S04 new file mode 100644 index 00000000000..60b3a95110d --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2013S04 @@ -0,0 +1,20 @@ +CCTV4_ACROSSSTRAIT_CMN_20070108_073033 +PHOENIX_NEWSLINE_CMN_20070101_114800 +CCTV4_TDYFOCUS_CMN_20070111_082801 +CCTV2_ECONOMYANDLAW_CMN_20070126_203005 +PHOENIX_BEHINDHL_CMN_20061004_052800 +PHOENIX_NEWSHACK_CMN_20060923_212301 +PHOENIX_NEWSLINE_CMN_20070102_114800 +PHOENIX_ASIANJRNL_CMN_20070313_075800 +PHOENIX_BEHINDHL_CMN_20061012_052800 +PHOENIX_NEWSLINE_CMN_20070105_114800 +CCTV4_TDYFOCUS_CMN_20061023_092800 +PHOENIX_SOCWATCH_CMN_20060928_225801 +PHOENIX_BEHINDHL_CMN_20061011_052800 +CCTVNEWS_TELLITLIKEITIS_CMN_20070114_140701 +CCTV4_TDYFOCUS_CMN_20070104_082800 +PHOENIX_NEWSLINE_CMN_20061020_114800 +PHOENIX_ASIANJRNL_CMN_20061002_085800 +PHOENIX_BEHINDHL_CMN_20070102_052800 +CCTV4_TDYFOCUS_CMN_20070108_082800 +PHOENIX_ASIANJRNL_CMN_20070111_075800 diff --git a/egs/gale_mandarin/s5/local/test.LDC2013S08 b/egs/gale_mandarin/s5/local/test.LDC2013S08 new file mode 100644 index 00000000000..6c0279412e9 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2013S08 @@ -0,0 +1,20 @@ +CCTV4_DAILYNEWS_CMN_20061023_135801 +CCTV4_DAILYNEWS_CMN_20060923_135800 +PHOENIX_PHNXWRLD_CMN_20070101_111800 +CCTV4_NEWS3_CMN_20060921_085800 +CCTV7_MILITARYNEWS1_CMN_20070102_193006 +PHOENIX_PHNXWRLD_CMN_20061024_112500 +CCTV7_MILITARYNEWS1_CMN_20070113_193011 +CCTV4_NEWS3_CMN_20061003_085800 +PHOENIX_PHNXWRLD_CMN_20061019_112401 +CCTV4_DAILYNEWS_CMN_20060920_135800 +PHOENIX_GOODMORNCN_CMN_20060926_185800 +ANHUI_NEWSREVIEW_CMN_20070103_175711 +CCTV4_DAILYNEWS_CMN_20060915_135800 +CCTV4_DAILYNEWS_CMN_20060924_135801 +PHOENIX_PHNXWRLD_CMN_20061018_112400 +CCTV7_MILITARYNEWS1_CMN_20070127_192932 +CCTVNEWS_EVENINGNEWS_CMN_20070123_225701 +CCTV4_NEWS3_CMN_20070116_075800 +PHOENIX_GOODMORNCN_CMN_20060918_185800 +PHOENIX_GOODMORNCN_CMN_20061009_185800 diff --git a/egs/gale_mandarin/s5/local/test.LDC2014S09 b/egs/gale_mandarin/s5/local/test.LDC2014S09 new file mode 100644 index 00000000000..ed871874636 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2014S09 @@ -0,0 +1,20 @@ +CCTV2_BUSINESSHOUR_CMN_20070428_070000 +CCTV1_LEGALREPORT_CMN_20070315_123701 +CCTV1_LEGALREPORT_CMN_20070418_123701 +CCTVNEWS_PEOPLESCONGRESS3_CMN_20070313_085702 +CCTV1_LEGALREPORT_CMN_20070426_123701 +CCTV4_ACROSSSTRAIT_CMN_20070430_073000 +HUBEI_COMMUNICATE_CMN_20070325_013001 +CCTVNEWS_PEOPLEINNEWS_CMN_20070327_215701 +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20070315_040701 +CCTV1_LEGALREPORT_CMN_20070416_123701 +CCTV2_PEOPLESCONGRESS1_CMN_20070315_213000 +CCTV2_ECONOMYANDLAW_CMN_20070313_105916 +CCTV1_LEGALREPORT_CMN_20070430_123701 +HUBEI_COMMUNICATE_CMN_20070415_230013 +CCTV2_ECONOMYANDLAW_CMN_20070323_202800 +CCTV1_LEGALREPORT_CMN_20070312_123702 +CCTV1_LEGALREPORT_CMN_20070210_123701 +CCTV4_ACROSSSTRAIT_CMN_20070324_073000 +CCTV4_ACROSSSTRAIT_CMN_20070321_034001 +CCTV2_ECONOMYANDLAW_CMN_20070317_202900 diff --git a/egs/gale_mandarin/s5/local/test.LDC2015S06 b/egs/gale_mandarin/s5/local/test.LDC2015S06 new file mode 100644 index 00000000000..dcdb97b1161 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2015S06 @@ -0,0 +1,14 @@ +CCTV1_LEGALREPORT_CMN_20070407_123702 +CCTV4_ACROSSSTRAIT_CMN_20070704_203000 +CCTV4_ACROSSSTRAIT_CMN_20070402_073000 +CCTV2_ECONOMYANDLAW_CMN_20070402_110000 +CCTV2_BUSINESSHOUR_CMN_20070829_220755 +CCTV1_LEGALREPORT_CMN_20070913_123702 +CCTV4_ACROSSSTRAIT_CMN_20070828_072923 +CCTV1_LEGALREPORT_CMN_20070826_123701 +CCTV4_ACROSSSTRAIT_CMN_20070715_203000 +CCTV4_ACROSSSTRAIT_CMN_20070404_202849 +CCTV2_DIALOG_CMN_20070707_090000 +CCTV1_LEGALREPORT_CMN_20070716_123701 +CCTV1_LEGALREPORT_CMN_20070408_123701 +CCTV4_ACROSSSTRAIT_CMN_20070712_203004 diff --git a/egs/gale_mandarin/s5/local/test.LDC2015S13 b/egs/gale_mandarin/s5/local/test.LDC2015S13 new file mode 100644 index 00000000000..ea52a7679af --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2015S13 @@ -0,0 +1,20 @@ +CCTV2_NEWSLIST_CMN_20070426_115000 +CCTV1_30MINNEWS_CMN_20070418_115702 +CCTV2_NEWSLIST_CMN_20070406_115000 +CCTV1_30MINNEWS_CMN_20070204_115701 +CCTVNEWS_EVENINGNEWS_CMN_20070315_225701 +CCTV1_30MINNEWS_CMN_20070417_115701 +CCTV1_30MINNEWS_CMN_20070208_115701 +CCTV4_NEWS3_CMN_20070327_075800 +CCTV7_MILITARYNEWS1_CMN_20070309_100451 +CCTV7_MILITARYNEWS1_CMN_20070310_093000 +CCTV7_MILITARYNEWS1_CMN_20070411_193000 +CCTV2_NEWSLIST_CMN_20070421_115000 +PHOENIX_PHNXWRLD_CMN_20070801_111801 +VOA_INTNLNEWS_CMN_20070927_210000 +PHOENIX_PHNXWRLD_CMN_20070326_111800 +PHOENIX_PHNXWRLD_CMN_20070821_111801 +CCTV1_30MINNEWS_CMN_20070307_115702 +CCTVNEWS_EVENINGNEWS_CMN_20070314_225701 +VOA_CURRENTEVENTS_CMN_20070807_220000 +CCTV1_30MINNEWS_CMN_20070207_115701 diff --git a/egs/gale_mandarin/s5/local/test.LDC2016S03 b/egs/gale_mandarin/s5/local/test.LDC2016S03 new file mode 100644 index 00000000000..73245ed4c29 --- /dev/null +++ b/egs/gale_mandarin/s5/local/test.LDC2016S03 @@ -0,0 +1,20 @@ +CCTVNEWS_PEOPLEINNEWS_CMN_20080325_202401 +PHOENIX_ASIANJRNL_CMN_20080725_085800 +VOA_LISTENERSHOTLINE_CMN_20080405_223000 +CCTV1_LEGALREPORT_CMN_20080329_123802 +CCTV2_DIALOG_CMN_20080323_220801 +CCTV2_ECONOMYANDLAW_CMN_20080312_202800 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080310_090602 +VOA_LISTENERSHOTLINE_CMN_20080402_223000 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080306_083701 +CCTV2_ECONOMYANDLAW_CMN_20080428_202802 +CCTV2_ECONOMYANDLAW_CMN_20080324_202802 +VOA_FOCUSDIALOGUE_CMN_20080412_210500 +CCTV4_ACROSSSTRAIT_CMN_20080416_073002 +VOA_STRAITSTALK_CMN_20080407_210500 +CCTVNEWS_PEOPLESCONGRESS2_CMN_20080308_083701 +CCTV1_LEGALREPORT_CMN_20080406_123801 +CCTV2_DIALOG_CMN_20080427_220801 +CCTV1_LEGALREPORT_CMN_20080411_123801 +CCTVNEWS_XIAOCUIINTERVIEW_CMN_20080306_122702 +CCTVNEWS_PEOPLEINNEWS_CMN_20080408_202701 diff --git a/egs/gale_mandarin/s5/path.sh b/egs/gale_mandarin/s5/path.sh index be11b34cbc6..e875e4b585c 100755 --- a/egs/gale_mandarin/s5/path.sh +++ b/egs/gale_mandarin/s5/path.sh @@ -1,5 +1,6 @@ export KALDI_ROOT=$(pwd)/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/kaldi_lm:$PWD:$PATH [ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 . $KALDI_ROOT/tools/config/common_path.sh +. $KALDI_ROOT/tools/env.sh export LC_ALL=C diff --git a/egs/gale_mandarin/s5/run.sh b/egs/gale_mandarin/s5/run.sh index 505ade6a269..f6c9f0828b7 100755 --- a/egs/gale_mandarin/s5/run.sh +++ b/egs/gale_mandarin/s5/run.sh @@ -6,31 +6,46 @@ . ./path.sh . ./cmd.sh -nJobs=40 -nDecodeJobs=40 - -AUDIO_PATH=/export/corpora5/LDC/LDC2013S08/ -TEXT_PATH=/export/corpora5/LDC/LDC2013T20/ - +nJobs=64 +nDecodeJobs=128 + +AUDIO=( + /scratch/groups/skhudan1/corpora/LDC2013S08/ + /scratch/groups/skhudan1/corpora/LDC2013S04/ + /scratch/groups/skhudan1/corpora/LDC2014S09/ + /scratch/groups/skhudan1/corpora/LDC2015S06/ + /scratch/groups/skhudan1/corpora/LDC2015S13/ + /scratch/groups/skhudan1/corpora/LDC2016S03/ +) +TEXT=( + /scratch/groups/skhudan1/corpora/LDC2013T20/ + /scratch/groups/skhudan1/corpora/LDC2013T08/ + /scratch/groups/skhudan1/corpora/LDC2014T28/ + /scratch/groups/skhudan1/corpora/LDC2015T09/ + /scratch/groups/skhudan1/corpora/LDC2015T25/ + /scratch/groups/skhudan1/corpora/LDC2016T12/ +) galeData=GALE/ # You can run the script from here automatically, but it is recommended to run the data preparation, # and features extraction manually and and only once. # By copying and pasting into the shell. -local/gale_data_prep_audio.sh $galeData $AUDIO_PATH - -local/gale_data_prep_txt.sh $galeData $TEXT_PATH +set -e -o pipefail +set -x + +local/gale_data_prep_audio.sh "${AUDIO[@]}" $galeData -local/gale_data_prep_split.sh $galeData +local/gale_data_prep_txt.sh "${TEXT[@]}" $galeData -local/gale_prep_dict.sh +local/gale_data_prep_split.sh $galeData +local/gale_prep_dict.sh -utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang +utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang local/gale_train_lms.sh -local/gale_format_data.sh +local/gale_format_data.sh # Now make MFCC features. # mfccdir should be some place with a largish disk where you @@ -38,6 +53,7 @@ local/gale_format_data.sh mfccdir=mfcc for x in train dev ; do + utils/fix_data_dir.sh data/$x steps/make_mfcc_pitch.sh --cmd "$train_cmd" --nj $nJobs \ data/$x exp/make_mfcc/$x $mfccdir utils/fix_data_dir.sh data/$x # some files fail to get mfcc for many reasons @@ -45,23 +61,25 @@ for x in train dev ; do done # Let's create a subset with 10k segments to make quick flat-start training: -utils/subset_data_dir.sh data/train 10000 data/train.10K || exit 1; +utils/subset_data_dir.sh data/train 10000 data/train.10k || exit 1; +utils/subset_data_dir.sh data/train 50000 data/train.50k || exit 1; +utils/subset_data_dir.sh data/train 100000 data/train.100k || exit 1; # Train monophone models on a subset of the data, 10K segment # Note: the --boost-silence option should probably be omitted by default steps/train_mono.sh --nj 40 --cmd "$train_cmd" \ - data/train.10K data/lang exp/mono || exit 1; + data/train.10k data/lang exp/mono || exit 1; # Get alignments from monophone system. steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ - data/train data/lang exp/mono exp/mono_ali || exit 1; + data/train.50k data/lang exp/mono exp/mono_ali.50k || exit 1; # train tri1 [first triphone pass] steps/train_deltas.sh --cmd "$train_cmd" \ - 2500 30000 data/train data/lang exp/mono_ali exp/tri1 || exit 1; + 2500 30000 data/train.50k data/lang exp/mono_ali.50k exp/tri1 || exit 1; # First triphone decoding -utils/mkgraph.sh data/lang_dev exp/tri1 exp/tri1/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/tri1 exp/tri1/graph || exit 1; steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ exp/tri1/graph data/dev exp/tri1/decode & @@ -73,14 +91,14 @@ steps/train_deltas.sh --cmd "$train_cmd" \ 3000 40000 data/train data/lang exp/tri1_ali exp/tri2a || exit 1; # tri2a decoding -utils/mkgraph.sh data/lang_dev exp/tri2a exp/tri2a/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/tri2a exp/tri2a/graph || exit 1; steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ exp/tri2a/graph data/dev exp/tri2a/decode & # train and decode tri2b [LDA+MLLT] steps/train_lda_mllt.sh --cmd "$train_cmd" 4000 50000 \ data/train data/lang exp/tri1_ali exp/tri2b || exit 1; -utils/mkgraph.sh data/lang_dev exp/tri2b exp/tri2b/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/tri2b exp/tri2b/graph || exit 1; steps/decode.sh --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph data/dev exp/tri2b/decode & # Align all data with LDA+MLLT system (tri2b) @@ -90,9 +108,9 @@ steps/align_si.sh --nj $nJobs --cmd "$train_cmd" \ # Do MMI on top of LDA+MLLT. steps/make_denlats.sh --nj $nJobs --cmd "$train_cmd" \ data/train data/lang exp/tri2b exp/tri2b_denlats || exit 1; - + steps/train_mmi.sh data/train data/lang exp/tri2b_ali \ - exp/tri2b_denlats exp/tri2b_mmi + exp/tri2b_denlats exp/tri2b_mmi steps/decode.sh --iter 4 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi/decode_it4 & @@ -100,10 +118,10 @@ steps/decode.sh --iter 3 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi/decode_it3 & # Do the same with boosting. steps/train_mmi.sh --boost 0.1 data/train data/lang exp/tri2b_ali \ -exp/tri2b_denlats exp/tri2b_mmi_b0.1 +exp/tri2b_denlats exp/tri2b_mmi_b0.1 steps/decode.sh --iter 4 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ - data/dev exp/tri2b_mmi_b0.1/decode_it4 & + data/dev exp/tri2b_mmi_b0.1/decode_it4 & steps/decode.sh --iter 3 --nj $nJobs --cmd "$decode_cmd" exp/tri2b/graph \ data/dev exp/tri2b_mmi_b0.1/decode_it3 & @@ -119,7 +137,7 @@ steps/decode.sh --iter 3 --nj $nDecodeJobs --cmd "$decode_cmd" exp/tri2b/graph # From 2b system, train 3b which is LDA + MLLT + SAT. steps/train_sat.sh --cmd "$train_cmd" \ 5000 100000 data/train data/lang exp/tri2b_ali exp/tri3b || exit 1; -utils/mkgraph.sh data/lang_dev exp/tri3b exp/tri3b/graph|| exit 1; +utils/mkgraph.sh data/lang_test exp/tri3b exp/tri3b/graph|| exit 1; steps/decode_fmllr.sh --nj $nDecodeJobs --cmd "$decode_cmd" \ exp/tri3b/graph data/dev exp/tri3b/decode & @@ -130,12 +148,11 @@ steps/align_fmllr.sh --nj $nJobs --cmd "$train_cmd" \ ## SGMM (subspace gaussian mixture model), excluding the "speaker-dependent weights" steps/train_ubm.sh --cmd "$train_cmd" 700 \ data/train data/lang exp/tri3b_ali exp/ubm5a || exit 1; - + steps/train_sgmm2.sh --cmd "$train_cmd" 5000 20000 data/train data/lang exp/tri3b_ali \ exp/ubm5a/final.ubm exp/sgmm_5a || exit 1; -utils/mkgraph.sh data/lang_dev exp/sgmm_5a exp/sgmm_5a/graph || exit 1; - +utils/mkgraph.sh data/lang_test exp/sgmm_5a exp/sgmm_5a/graph || exit 1; steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \ --transform-dir exp/tri3b/decode exp/sgmm_5a/graph data/dev exp/sgmm_5a/decode & @@ -143,27 +160,30 @@ steps/align_sgmm2.sh --nj $nJobs --cmd "$train_cmd" --transform-dir exp/tri3b_al --use-graphs true --use-gselect true data/train data/lang exp/sgmm_5a exp/sgmm_5a_ali || exit 1; ## boosted MMI on SGMM -steps/make_denlats_sgmm2.sh --nj $nJobs --sub-split 30 --beam 9.0 --lattice-beam 6 \ - --cmd "$decode_cmd" --transform-dir \ - exp/tri3b_ali data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1; - +steps/make_denlats_sgmm2.sh --nj $nJobs --sub-split $nJobs --beam 9.0 --lattice-beam 6 \ + --cmd "$decode_cmd" --num-threads 4 --transform-dir exp/tri3b_ali \ + data/train data/lang exp/sgmm_5a_ali exp/sgmm_5a_denlats || exit 1; + steps/train_mmi_sgmm2.sh --cmd "$train_cmd" --num-iters 8 --transform-dir exp/tri3b_ali --boost 0.1 \ data/train data/lang exp/sgmm_5a exp/sgmm_5a_denlats exp/sgmm_5a_mmi_b0.1 - + #decode GMM MMI -utils/mkgraph.sh data/lang_dev exp/sgmm_5a_mmi_b0.1 exp/sgmm_5a_mmi_b0.1/graph || exit 1; +utils/mkgraph.sh data/lang_test exp/sgmm_5a_mmi_b0.1 exp/sgmm_5a_mmi_b0.1/graph || exit 1; steps/decode_sgmm2.sh --nj $nDecodeJobs --cmd "$decode_cmd" --config conf/decode.config \ - --transform-dir exp/tri3b/decode exp/sgmm_5a_mmi_b0.1/graph data/dev exp/sgmm_5a_mmi_b0.1/decode & - + --transform-dir exp/tri3b/decode exp/sgmm_5a_mmi_b0.1/graph data/dev exp/sgmm_5a_mmi_b0.1/decode + for n in 1 2 3 4; do - steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_dev \ + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_test \ data/dev exp/sgmm_5a_mmi_b0.1/decode exp/sgmm_5a_mmi_b0.1/decode$n - - steps/decode_sgmm_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_dev \ +done + +for n in 1 2 3 4; do + steps/decode_sgmm2_rescore.sh --cmd "$decode_cmd" --iter $n --transform-dir exp/tri3b/decode data/lang_test \ data/dev exp/sgmm_5a/decode exp/sgmm_5a_mmi_onlyRescoreb0.1/decode$n done +wait local/nnet/run_dnn.sh time=$(date +"%Y-%m-%d-%H-%M-%S") diff --git a/egs/wsj/s5/utils/pinyin_map.pl b/egs/wsj/s5/utils/pinyin_map.pl index 65b260e2418..8210ec2af51 100755 --- a/egs/wsj/s5/utils/pinyin_map.pl +++ b/egs/wsj/s5/utils/pinyin_map.pl @@ -10,7 +10,7 @@ open(MAPS, $ARGV[0]) or die("Could not open pinyin map file."); my %py2ph; foreach $line () { @A = split(" ", $line); $py = shift(@A); - $py2ph{$py} = [@A]; + $py2ph{$py} = [@A]; } #foreach $word ( keys %py2ph ) { @@ -25,14 +25,14 @@ while () { @A = split(" ", $_); - @entry = (); + @entry = (); $W = shift(@A); push(@entry, $W); for($i = 0; $i < @A; $i++) { $initial= $A[$i]; $final = $A[$i]; #print $initial, " ", $final, "\n"; if ($A[$i] =~ /^CH[A-Z0-9]+$/) {$initial =~ s:(CH)[A-Z0-9]+:$1:; $final =~ s:CH([A-Z0-9]+):$1:;} - elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} + elsif ($A[$i] =~ /^SH[A-Z0-9]+$/) {$initial =~ s:(SH)[A-Z0-9]+:$1:; $final =~ s:SH([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^ZH[A-Z0-9]+$/) {$initial =~ s:(ZH)[A-Z0-9]+:$1:; $final =~ s:ZH([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^B[A-Z0-9]+$/) {$initial =~ s:(B)[A-Z0-9]+:$1:; $final =~ s:B([A-Z0-9]+):$1:;} elsif ($A[$i] =~ /^C[A-Z0-9]+$/) {$initial =~ s:(C)[A-Z0-9]+:$1:; $final =~ s:C([A-Z0-9]+):$1:;} @@ -58,22 +58,22 @@ $tone = $final; $final =~ s:([A-Z]+)[0-9]:$1:; $tone =~ s:[A-Z]+([0-9]):$1:; - if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { print "1: no entry find for ", $A[$i], " ", $initial, " ", $final; exit;} - push(@entry, @{$py2ph{$initial}}); + if (!(exists $py2ph{$initial}) or !(exists $py2ph{$final})) { die "$0: no entry find for ", $A[$i], " ", $initial, " ", $final;} + push(@entry, @{$py2ph{$initial}}); @tmp = @{$py2ph{$final}}; for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} - push(@entry, @tmp); + push(@entry, @tmp); } else { $tone = $A[$i]; - $A[$i] =~ s:([A-Z]+)[0-9]:$1:; + $A[$i] =~ s:([A-Z]+)[0-9]:$1:; $tone =~ s:[A-Z]+([0-9]):$1:; - if (!(exists $py2ph{$A[$i]})) { print "2: no entry find for ", $A[$i]; exit;} + if (!(exists $py2ph{$A[$i]})) { die "$0: no entry find for ", $A[$i];} @tmp = @{$py2ph{$A[$i]}}; for($j = 0; $j < @tmp ; $j++) {$tmp[$j] = $tmp[$j].$tone;} - push(@entry, @tmp); + push(@entry, @tmp); } - } + } print "@entry"; print "\n"; }