Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions egs/gale_arabic/s5/local/split_wer_per_corpus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

# Report WER separately for each test split (e.g. reports vs. conversational).
# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0
#
# Usage: local/split_wer_per_corpus.sh <gale-folder>
#
# For every decode directory matching exp/*/*decode* and every split list
# found in local/ (local/test.* and local/test_list), this script:
#   1. filters the hypothesis files ($dir/scoring/*.tra) and the reference
#      ($dir/scoring/test_filt.txt) down to the lines matching the patterns
#      in the split list (grep -f), writing them to $dir/scoring_<split>/;
#   2. scores each LM weight in [min_lmwt, max_lmwt] with compute-wer,
#      writing $dir/wer_<split>_<LMWT>;
# and finally prints, per split, one line per decode directory as selected by
# utils/best_wer.sh, sorted numerically on the second field.
#
# Must be run from the recipe directory (paths are relative to it).

if [ $# -ne 1 ]; then
  echo "Arguments should be the gale folder, see ../run.sh for example." >&2
  exit 1
fi

[ -f ./path.sh ] && . ./path.sh

# Left disabled on purpose: grep exits non-zero when a split list matches
# nothing in a file, which would abort the whole run under -e/pipefail.
#set -o pipefail -e

# Resolved for parity with the other recipe scripts; not referenced below.
galeFolder=$(readlink -f "$1")
symtab=./data/lang/words.txt

# Range of language-model weights that gets scored.
min_lmwt=7
max_lmwt=20

# Collect the split names once instead of parsing `ls` output twice.
# local/test.* is listed before local/test_list to keep the order that the
# sorted `ls` listing produced.
types=()
for list in local/test.* local/test_list; do
  # Skips an unmatched glob (which stays literal) and a missing test_list.
  [ -f "$list" ] || continue
  types+=("${list##*/}")
done

for dir in exp/*/*decode*; do
  # An unmatched glob is left as the literal pattern; without this guard the
  # rm/mkdir below would create directories literally named '*'.
  [ -d "$dir" ] || continue
  if [ ! -f "$dir/scoring/test_filt.txt" ]; then
    echo "$0: skipping $dir: no scoring/test_filt.txt" >&2
    continue
  fi

  for type in "${types[@]}"; do
    #echo "Processing: $dir $type"
    rm -fr -- "$dir/scoring_$type"
    mkdir -p "$dir/scoring_$type/log"

    # Keep only the utterances belonging to this split.
    for x in "$dir"/scoring/*.tra "$dir/scoring/test_filt.txt"; do
      [ -f "$x" ] || continue   # no .tra files: glob stayed literal
      grep -f "local/$type" "$x" > "$dir/scoring_$type/$(basename "$x")"
    done

    # run.pl substitutes LMWT with each value in the range and runs the
    # pipeline once per value; the escaped pipes and ">&" are forwarded to it.
    # NOTE(review): the sed expression is re-parsed by the shell that run.pl
    # spawns, so it must stay exactly as written; confirm before changing.
    utils/run.pl LMWT=$min_lmwt:$max_lmwt "$dir/scoring_$type/log/score.LMWT.log" \
      cat "$dir/scoring_${type}/LMWT.tra" \| \
      utils/int2sym.pl -f 2- "$symtab" \| sed 's:\<UNK\>::g' \| \
      compute-wer --text --mode=present \
      "ark:$dir/scoring_${type}/test_filt.txt" ark,p:- ">&" "$dir/wer_${type}_LMWT"
  done
done

timestamp=$(date +"%Y-%m-%d-%H-%M-%S")
echo "#RESULTS splits generated by $USER at $timestamp"

for type in "${types[@]}"; do
  printf '\n# WER %s\n' "$type"
  for x in exp/*/*decode*; do
    [ -d "$x" ] || continue
    # The glob is deliberately unquoted so it expands to all LM weights.
    grep WER "$x"/wer_"${type}"_* | utils/best_wer.sh
  done | sort -n -k2
done




97 changes: 97 additions & 0 deletions egs/gale_mandarin/s5/RESULTS
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#RESULTS splits generated by jtrmal1@jhu.edu at 2016-11-21-12-05-54

# WER test.LDC2013S04
%WER 42.23 [ 40179 / 95137, 5329 ins, 8769 del, 26081 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S04_10
%WER 43.81 [ 41682 / 95137, 5469 ins, 9213 del, 27000 sub ] exp/tri3b/decode/wer_test.LDC2013S04_13
%WER 49.06 [ 46677 / 95137, 5459 ins, 10672 del, 30546 sub ] exp/tri2b/decode/wer_test.LDC2013S04_13
%WER 50.53 [ 48073 / 95137, 5505 ins, 11022 del, 31546 sub ] exp/tri3b/decode.si/wer_test.LDC2013S04_12
%WER 51.47 [ 48971 / 95137, 5103 ins, 12391 del, 31477 sub ] exp/tri2a/decode/wer_test.LDC2013S04_13
%WER 53.30 [ 50708 / 95137, 4829 ins, 13624 del, 32255 sub ] exp/tri1/decode/wer_test.LDC2013S04_13

# WER test.LDC2013S08
%WER 26.01 [ 20781 / 79911, 3764 ins, 3034 del, 13983 sub ] exp/sgmm_5a/decode/wer_test.LDC2013S08_8
%WER 27.43 [ 21917 / 79911, 3644 ins, 3544 del, 14729 sub ] exp/tri3b/decode/wer_test.LDC2013S08_13
%WER 31.24 [ 24968 / 79911, 3820 ins, 3943 del, 17205 sub ] exp/tri2b/decode/wer_test.LDC2013S08_12
%WER 32.45 [ 25932 / 79911, 3816 ins, 4112 del, 18004 sub ] exp/tri3b/decode.si/wer_test.LDC2013S08_11
%WER 34.22 [ 27349 / 79911, 3677 ins, 5034 del, 18638 sub ] exp/tri2a/decode/wer_test.LDC2013S08_13
%WER 35.88 [ 28676 / 79911, 3715 ins, 5127 del, 19834 sub ] exp/tri1/decode/wer_test.LDC2013S08_12

# WER test.LDC2014S09
%WER 50.54 [ 39383 / 77932, 10535 ins, 7593 del, 21255 sub ] exp/sgmm_5a/decode/wer_test.LDC2014S09_12
%WER 52.14 [ 40634 / 77932, 10271 ins, 8530 del, 21833 sub ] exp/tri3b/decode/wer_test.LDC2014S09_17
%WER 56.57 [ 44085 / 77932, 9394 ins, 10954 del, 23737 sub ] exp/tri2b/decode/wer_test.LDC2014S09_16
%WER 57.95 [ 45158 / 77932, 8777 ins, 12547 del, 23834 sub ] exp/tri2a/decode/wer_test.LDC2014S09_15
%WER 58.19 [ 45347 / 77932, 9712 ins, 10831 del, 24804 sub ] exp/tri3b/decode.si/wer_test.LDC2014S09_15
%WER 59.38 [ 46277 / 77932, 7944 ins, 14560 del, 23773 sub ] exp/tri1/decode/wer_test.LDC2014S09_16

# WER test.LDC2015S06
%WER 46.22 [ 28480 / 61612, 8454 ins, 5015 del, 15011 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S06_9
%WER 48.08 [ 29624 / 61612, 8471 ins, 5669 del, 15484 sub ] exp/tri3b/decode/wer_test.LDC2015S06_13
%WER 52.67 [ 32450 / 61612, 8425 ins, 6441 del, 17584 sub ] exp/tri2b/decode/wer_test.LDC2015S06_12
%WER 53.51 [ 32968 / 61612, 8444 ins, 6576 del, 17948 sub ] exp/tri3b/decode.si/wer_test.LDC2015S06_11
%WER 55.08 [ 33936 / 61612, 8031 ins, 7811 del, 18094 sub ] exp/tri2a/decode/wer_test.LDC2015S06_13
%WER 56.70 [ 34937 / 61612, 7890 ins, 8531 del, 18516 sub ] exp/tri1/decode/wer_test.LDC2015S06_13

# WER test.LDC2015S13
%WER 23.35 [ 19752 / 84594, 2196 ins, 3274 del, 14282 sub ] exp/sgmm_5a/decode/wer_test.LDC2015S13_9
%WER 24.81 [ 20984 / 84594, 2214 ins, 3600 del, 15170 sub ] exp/tri3b/decode/wer_test.LDC2015S13_12
%WER 28.62 [ 24211 / 84594, 2306 ins, 4186 del, 17719 sub ] exp/tri2b/decode/wer_test.LDC2015S13_12
%WER 30.03 [ 25405 / 84594, 2106 ins, 4617 del, 18682 sub ] exp/tri3b/decode.si/wer_test.LDC2015S13_12
%WER 30.58 [ 25869 / 84594, 2142 ins, 4798 del, 18929 sub ] exp/tri2a/decode/wer_test.LDC2015S13_12
%WER 32.16 [ 27206 / 84594, 1958 ins, 5681 del, 19567 sub ] exp/tri1/decode/wer_test.LDC2015S13_13

# WER test.LDC2016S03
%WER 53.04 [ 77015 / 145212, 34385 ins, 9733 del, 32897 sub ] exp/sgmm_5a/decode/wer_test.LDC2016S03_12
%WER 54.68 [ 79399 / 145212, 34634 ins, 10414 del, 34351 sub ] exp/tri3b/decode/wer_test.LDC2016S03_17
%WER 58.99 [ 85661 / 145212, 33946 ins, 12904 del, 38811 sub ] exp/tri2b/decode/wer_test.LDC2016S03_16
%WER 59.80 [ 86841 / 145212, 34387 ins, 12610 del, 39844 sub ] exp/tri3b/decode.si/wer_test.LDC2016S03_15
%WER 60.29 [ 87547 / 145212, 31358 ins, 15266 del, 40923 sub ] exp/tri2a/decode/wer_test.LDC2016S03_16
%WER 61.75 [ 89662 / 145212, 30628 ins, 16992 del, 42042 sub ] exp/tri1/decode/wer_test.LDC2016S03_16

# CER test.LDC2013S04
%WER 33.93 [ 51673 / 152279, 7241 ins, 12180 del, 32252 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S04_10
%WER 35.31 [ 53769 / 152279, 7813 ins, 11593 del, 34363 sub ] exp/tri3b/decode/cer_test.LDC2013S04_11
%WER 40.56 [ 61767 / 152279, 8062 ins, 13321 del, 40384 sub ] exp/tri2b/decode/cer_test.LDC2013S04_11
%WER 42.08 [ 64081 / 152279, 8052 ins, 13940 del, 42089 sub ] exp/tri3b/decode.si/cer_test.LDC2013S04_10
%WER 43.22 [ 65818 / 152279, 7602 ins, 15416 del, 42800 sub ] exp/tri2a/decode/cer_test.LDC2013S04_11
%WER 44.93 [ 68413 / 152279, 7255 ins, 16855 del, 44303 sub ] exp/tri1/decode/cer_test.LDC2013S04_11

# CER test.LDC2013S08
%WER 19.18 [ 25398 / 132434, 4773 ins, 3650 del, 16975 sub ] exp/sgmm_5a/decode/cer_test.LDC2013S08_8
%WER 20.54 [ 27201 / 132434, 4792 ins, 4037 del, 18372 sub ] exp/tri3b/decode/cer_test.LDC2013S08_11
%WER 24.12 [ 31943 / 132434, 4817 ins, 4968 del, 22158 sub ] exp/tri2b/decode/cer_test.LDC2013S08_12
%WER 25.15 [ 33309 / 132434, 4839 ins, 5019 del, 23451 sub ] exp/tri3b/decode.si/cer_test.LDC2013S08_11
%WER 26.90 [ 35623 / 132434, 4725 ins, 6057 del, 24841 sub ] exp/tri2a/decode/cer_test.LDC2013S08_12
%WER 28.45 [ 37674 / 132434, 4506 ins, 6690 del, 26478 sub ] exp/tri1/decode/cer_test.LDC2013S08_12

# CER test.LDC2014S09
%WER 42.24 [ 53240 / 126027, 16007 ins, 10270 del, 26963 sub ] exp/sgmm_5a/decode/cer_test.LDC2014S09_11
%WER 43.81 [ 55212 / 126027, 15435 ins, 11971 del, 27806 sub ] exp/tri3b/decode/cer_test.LDC2014S09_15
%WER 48.72 [ 61395 / 126027, 14667 ins, 15066 del, 31662 sub ] exp/tri2b/decode/cer_test.LDC2014S09_14
%WER 50.20 [ 63270 / 126027, 15105 ins, 14701 del, 33464 sub ] exp/tri3b/decode.si/cer_test.LDC2014S09_13
%WER 50.37 [ 63481 / 126027, 13343 ins, 18289 del, 31849 sub ] exp/tri2a/decode/cer_test.LDC2014S09_14
%WER 51.95 [ 65470 / 126027, 12613 ins, 20231 del, 32626 sub ] exp/tri1/decode/cer_test.LDC2014S09_14

# CER test.LDC2015S06
%WER 38.57 [ 38234 / 99132, 12510 ins, 7120 del, 18604 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S06_9
%WER 40.30 [ 39954 / 99132, 12593 ins, 7986 del, 19375 sub ] exp/tri3b/decode/cer_test.LDC2015S06_12
%WER 44.83 [ 44438 / 99132, 12639 ins, 8903 del, 22896 sub ] exp/tri2b/decode/cer_test.LDC2015S06_11
%WER 45.71 [ 45318 / 99132, 12631 ins, 9164 del, 23523 sub ] exp/tri3b/decode.si/cer_test.LDC2015S06_10
%WER 47.39 [ 46983 / 99132, 12432 ins, 9935 del, 24616 sub ] exp/tri2a/decode/cer_test.LDC2015S06_11
%WER 49.03 [ 48600 / 99132, 12250 ins, 10831 del, 25519 sub ] exp/tri1/decode/cer_test.LDC2015S06_11

# CER test.LDC2015S13
%WER 17.05 [ 23993 / 140702, 2450 ins, 3594 del, 17949 sub ] exp/sgmm_5a/decode/cer_test.LDC2015S13_8
%WER 18.39 [ 25872 / 140702, 2257 ins, 4274 del, 19341 sub ] exp/tri3b/decode/cer_test.LDC2015S13_11
%WER 21.98 [ 30933 / 140702, 2347 ins, 4784 del, 23802 sub ] exp/tri2b/decode/cer_test.LDC2015S13_11
%WER 23.23 [ 32679 / 140702, 2197 ins, 5383 del, 25099 sub ] exp/tri3b/decode.si/cer_test.LDC2015S13_11
%WER 23.88 [ 33596 / 140702, 2030 ins, 6225 del, 25341 sub ] exp/tri2a/decode/cer_test.LDC2015S13_12
%WER 25.47 [ 35842 / 140702, 1944 ins, 6979 del, 26919 sub ] exp/tri1/decode/cer_test.LDC2015S13_12

# CER test.LDC2016S03
%WER 45.40 [ 106787 / 235216, 53964 ins, 12519 del, 40304 sub ] exp/sgmm_5a/decode/cer_test.LDC2016S03_11
%WER 46.75 [ 109953 / 235216, 54007 ins, 13639 del, 42307 sub ] exp/tri3b/decode/cer_test.LDC2016S03_15
%WER 51.08 [ 120139 / 235216, 53593 ins, 16514 del, 50032 sub ] exp/tri2b/decode/cer_test.LDC2016S03_14
%WER 51.97 [ 122235 / 235216, 52763 ins, 17940 del, 51532 sub ] exp/tri3b/decode.si/cer_test.LDC2016S03_15
%WER 52.61 [ 123739 / 235216, 47836 ins, 22637 del, 53266 sub ] exp/tri2a/decode/cer_test.LDC2016S03_16
%WER 54.06 [ 127163 / 235216, 47776 ins, 23865 del, 55522 sub ] exp/tri1/decode/cer_test.LDC2016S03_15
Empty file.
12 changes: 12 additions & 0 deletions egs/gale_mandarin/s5/local/bad_utts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
CCTVNEWS_XIAOCUIINTERVIEW_CMN_20070308_040701
CCTV2_ECONOMYANDLAW_CMN_20070426_202800
CCTV2_ECONOMYANDLAW_CMN_20070426_202800(1)
CCTV2_LIANGHUI_PROBLEM_20070308_213000
CCTV4_TDYFOCUS_CMN_20070824_092801
VOA_ISSUESANDOPINIONS_CMN_20070801_210500
VOA_ISSUESANDOPINIONS_CMN_20070926_210500
VOA_LISTENERSHOTLINE_CMN_20070906_223000
VOA_LISTENERSHOTLINE_CMN_20070926_223000
VOA_LISTENERSHOTLINE_CMN_20070927_223000
PHOENIX_NEWSLINE_CMN_20070101_114800
PHOENIX_NEWSLINE_CMN_20070101_114800(1)
79 changes: 51 additions & 28 deletions egs/gale_mandarin/s5/local/gale_data_prep_audio.sh
Original file line number Diff line number Diff line change
@@ -1,46 +1,69 @@
#!/bin/bash
#!/bin/bash

# Copyright 2014 QCRI (author: Ahmed Ali)
# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
# Apache 2.0


if [ $# -ne 2 ]; then
echo "Arguments should be the <output folder> <data folder> "; exit 1
fi
echo $0 "$@"

galeData=$(readlink -f "${@: -1}" );
wavedir=$galeData/wav
mkdir -p $wavedir


# check that sox is installed
length=$(($#-1))
args=${@:1:$length}

# check that sox is installed
which sox &>/dev/null
if [[ $? != 0 ]]; then
echo "sox is not installed"
exit 1
if [[ $? != 0 ]]; then
echo "$0: sox is not installed"
exit 1
fi

galeData=$1
wavedir=$galeData/wav
mkdir -p $wavedir
set -e -o pipefail

for var in $args; do
CD=$(basename $var)
[ -d $wavedir/$CD ] && rm -rf $wavedir/$CD
mkdir -p $wavedir/$CD
find $var -type f -name *.wav | while read file; do
f=$(basename $file)
if [[ ! -L "$wavedir/$CD/$f" ]]; then
ln -sf $file $wavedir/$CD/$f
fi
done

audio_path=$2

mkdir -p $wavedir/

#copy and convert the flac to wav
find $audio_path -type f -name *.flac | while read file; do
f_name=$(basename $file)
if [[ ! -e $wavedir/"${f_name%.flac}.wav" ]]; then
echo "soxing $file to $wavedir/$CD/"${f_name%.flac}.wav" "
sox $file $wavedir/"${f_name%.flac}.wav"
fi

#make a flac symlink as well
find $var -type f -name *.flac | while read file; do
f=$(basename $file)

if [[ ! -L "$wavedir/$CD/$f" ]]; then
ln -sf $file $wavedir/$CD/$f
fi
done
done

find $wavedir -name *.wav > $galeData/wav$$
awk -F "/" '{print $NF}' $galeData/wav$$ | sed 's:\.wav::' > $galeData/id$$
paste -d ' ' $galeData/id$$ $galeData/wav$$ | sort -u > $galeData/wav.scp
#figure out the proper sox command line
#the flac will be converted on the fly
(
for w in `find $wavedir -name *.wav` ; do
base=`basename $w .wav`
fullpath=`readlink -f $w`
echo "$base sox $fullpath -r 16000 -t wav - |"
done

for w in `find $wavedir -name *.flac` ; do
base=`basename $w .flac`
fullpath=`readlink -f $w`
echo "$base sox $fullpath -r 16000 -t wav - |"
done
) | sort -u > $galeData/wav.scp

#clean
#clean
rm -fr $galeData/id$$ $galeData/wav$$
echo data prep audio succeded
echo "$0: data prep audio succeded"

exit 0

33 changes: 15 additions & 18 deletions egs/gale_mandarin/s5/local/gale_data_prep_split.sh
Original file line number Diff line number Diff line change
@@ -1,37 +1,33 @@
#!/bin/bash
#!/bin/bash

# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
# Apache 2.0

if [ $# -ne 1 ]; then
echo "Arguments should be the <gale folder>"; exit 1
fi

set -e -o pipefail
# data will be written to data/local

galeData=$(readlink -f $1)
mkdir -p data/local
dir=$(readlink -f data/local)

cat $galeData/utt2spk | awk '{print$2}' | sort -u > $galeData/spklist

cat $galeData/spklist | utils/shuffle_list.pl --srand ${seed:-777} > $galeData/spklist.shuffled

# we want about 6h dev data; 300 is manually chosen
cat $galeData/spklist.shuffled | head -n 300 > $galeData/spklist.dev


cat $galeData/utt2spk | grep -f $galeData/spklist.dev | awk '{print$1}' > $galeData/dev.list

# some problem with the text data; same utt id but different transcription
cat $galeData/all | awk '{print$2}' | sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list
cat $galeData/all | awk '{print$2}' | \
sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list

utils/filter_scp.pl --exclude -f 2 $galeData/dup.list $galeData/all > $galeData/all_nodup
utils/filter_scp.pl --exclude -f 2 \
$galeData/dup.list $galeData/all > $galeData/all.nodup

mv $galeData/all_nodup $galeData/all
mv $galeData/all $galeData/all.orig
mv $galeData/all.nodup $galeData/all

utils/filter_scp.pl -f 2 $galeData/dev.list $galeData/all > $galeData/all.dev
utils/filter_scp.pl --exclude -f 2 $galeData/dev.list $galeData/all > $galeData/all.train
grep -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.dev
grep -v -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.train

cat $galeData/all.dev | awk '{print$2}' > $galeData/dev_utt_list
cat $galeData/all.train | awk '{print$2}' > $galeData/train_utt_list
Expand All @@ -46,11 +42,11 @@ utils/utt2spk_to_spk2utt.pl $dir/train/utt2spk | sort -u > $dir/train/spk2utt

for x in dev train; do
outdir=$dir/$x
file=$galeData/all.$x
file=$galeData/all.$x
mkdir -p $outdir
awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments
awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
done
done

cat $dir/dev/segments | awk '{print$2}' | sort -u > $galeData/dev.wav.list
cat $dir/train/segments | awk '{print$2}' | sort -u > $galeData/train.wav.list
Expand All @@ -60,5 +56,6 @@ utils/filter_scp.pl -f 1 $galeData/train.wav.list $galeData/wav.scp > $dir/train

cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg) >0) {seen[$2]=1;}}
{if (seen[$1]) { print $0}}' > $dir/train/wav.scp



echo data prep split succeeded
Loading