Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 51 additions & 0 deletions egs/gale_arabic/s5/local/split_wer_per_corpus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

# Report WER for reports and conversational
# Copyright 2014 QCRI (author: Ahmed Ali)
# Apache 2.0
#
# Usage: $0 <gale-folder>
#
# For every decode directory matching exp/*/*decode* and every utterance
# list local/test_list and local/test.*, this script:
#   1. filters the existing scoring output ($dir/scoring/*.tra and
#      $dir/scoring/test_filt.txt) down to the lines matching that list,
#      writing the result to $dir/scoring_<list>/;
#   2. re-scores every LM weight in [min_lmwt, max_lmwt] with compute-wer,
#      writing $dir/wer_<list>_<LMWT>;
#   3. prints, per list, the best WER of each decode directory.

if [ $# -ne 1 ]; then
  echo "Arguments should be the gale folder, see ../run.sh for example."
  exit 1
fi

[ -f ./path.sh ] && . ./path.sh

#set -o pipefail -e

# NOTE(review): galeFolder is not referenced anywhere else in this script;
# it is kept so the argument is still resolved as before.
galeFolder=$(readlink -f "$1")
symtab=./data/lang/words.txt

min_lmwt=7
max_lmwt=20

# Names of the utterance lists to split on. Globbing directly (instead of
# parsing `ls`) keeps odd file names intact; patterns that match nothing
# are skipped instead of being passed on literally.
types=()
for list in local/test_list local/test.*; do
  [ -f "$list" ] || continue
  types+=("${list##*/}")
done

for dir in exp/*/*decode*; do
  # An unmatched glob stays literal; without this guard mkdir -p below
  # would create a directory tree literally named 'exp/*/*decode*'.
  [ -d "$dir" ] || continue
  for type in "${types[@]}"; do
    #echo "Processing: $dir $type"
    rm -fr "$dir/scoring_$type"
    mkdir -p "$dir/scoring_$type/log"
    for x in "$dir"/scoring/*.tra "$dir/scoring/test_filt.txt"; do
      # Skip missing inputs (e.g. an unmatched '*.tra' pattern) so that no
      # output file with a literal '*' in its name is created.
      [ -f "$x" ] || continue
      grep -f "local/$type" "$x" > "$dir/scoring_$type/$(basename "$x")"
    done

    # The escaped pipes and the quoted ">&" are passed through to run.pl,
    # which runs the pipeline once per LMWT value.
    utils/run.pl LMWT=$min_lmwt:$max_lmwt "$dir/scoring_$type/log/score.LMWT.log" \
      cat "$dir/scoring_${type}/LMWT.tra" \| \
      utils/int2sym.pl -f 2- "$symtab" \| sed 's:\<UNK\>::g' \| \
      compute-wer --text --mode=present \
      "ark:$dir/scoring_${type}/test_filt.txt" ark,p:- ">&" "$dir/wer_${type}_LMWT"
  done
done

timestamp=$(date +"%Y-%m-%d-%H-%M-%S")
echo "#RESULTS splits generated by $USER at $timestamp"

for type in "${types[@]}"; do
  printf '\n# WER %s\n' "$type"
  for x in exp/*/*decode*; do
    [ -d "$x" ] || continue
    grep WER "$x/wer_${type}_"* | utils/best_wer.sh
  done | sort -n -k2
done




266 changes: 266 additions & 0 deletions egs/gale_mandarin/s5/RESULTS

Large diffs are not rendered by default.

Empty file.
12 changes: 12 additions & 0 deletions egs/gale_mandarin/s5/local/bad_utts
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
CCTVNEWS_XIAOCUIINTERVIEW_CMN_20070308_040701
CCTV2_ECONOMYANDLAW_CMN_20070426_202800
CCTV2_ECONOMYANDLAW_CMN_20070426_202800(1)
CCTV2_LIANGHUI_PROBLEM_20070308_213000
CCTV4_TDYFOCUS_CMN_20070824_092801
VOA_ISSUESANDOPINIONS_CMN_20070801_210500
VOA_ISSUESANDOPINIONS_CMN_20070926_210500
VOA_LISTENERSHOTLINE_CMN_20070906_223000
VOA_LISTENERSHOTLINE_CMN_20070926_223000
VOA_LISTENERSHOTLINE_CMN_20070927_223000
PHOENIX_NEWSLINE_CMN_20070101_114800
PHOENIX_NEWSLINE_CMN_20070101_114800(1)
79 changes: 51 additions & 28 deletions egs/gale_mandarin/s5/local/gale_data_prep_audio.sh
Original file line number Diff line number Diff line change
@@ -1,46 +1,69 @@
#!/bin/bash
#!/bin/bash

# Copyright 2014 QCRI (author: Ahmed Ali)
# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
# Apache 2.0


if [ $# -ne 2 ]; then
echo "Arguments should be the <output folder> <data folder> "; exit 1
fi
echo $0 "$@"

galeData=$(readlink -f "${@: -1}" );
wavedir=$galeData/wav
mkdir -p $wavedir


# check that sox is installed
length=$(($#-1))
args=${@:1:$length}

# check that sox is installed
which sox &>/dev/null
if [[ $? != 0 ]]; then
echo "sox is not installed"
exit 1
if [[ $? != 0 ]]; then
echo "$0: sox is not installed"
exit 1
fi

galeData=$1
wavedir=$galeData/wav
mkdir -p $wavedir
set -e -o pipefail

for var in $args; do
CD=$(basename $var)
[ -d $wavedir/$CD ] && rm -rf $wavedir/$CD
mkdir -p $wavedir/$CD
find $var -type f -name *.wav | while read file; do
f=$(basename $file)
if [[ ! -L "$wavedir/$CD/$f" ]]; then
ln -sf $file $wavedir/$CD/$f
fi
done

audio_path=$2

mkdir -p $wavedir/

#copy and convert the flac to wav
find $audio_path -type f -name *.flac | while read file; do
f_name=$(basename $file)
if [[ ! -e $wavedir/"${f_name%.flac}.wav" ]]; then
echo "soxing $file to $wavedir/$CD/"${f_name%.flac}.wav" "
sox $file $wavedir/"${f_name%.flac}.wav"
fi

#make a flac symlink as well
find $var -type f -name *.flac | while read file; do
f=$(basename $file)

if [[ ! -L "$wavedir/$CD/$f" ]]; then
ln -sf $file $wavedir/$CD/$f
fi
done
done

find $wavedir -name *.wav > $galeData/wav$$
awk -F "/" '{print $NF}' $galeData/wav$$ | sed 's:\.wav::' > $galeData/id$$
paste -d ' ' $galeData/id$$ $galeData/wav$$ | sort -u > $galeData/wav.scp
#figure out the proper sox command line
#the flac will be converted on the fly
# Build wav.scp: one "<recording-id> <sox command> |" entry per audio file
# found under $wavedir. The id is the file name without its extension; the
# sox pipeline resamples to 16 kHz and writes wav to stdout, so flac files
# are converted on the fly. sort -u removes duplicate entries.
(
  for ext in wav flac; do
    # The pattern is quoted so the shell cannot expand it against files in
    # the current directory before find sees it; reading line by line keeps
    # paths from being word-split.
    find "$wavedir" -name "*.$ext" | while IFS= read -r w; do
      base=$(basename "$w" ".$ext")
      fullpath=$(readlink -f "$w")
      echo "$base sox $fullpath -r 16000 -t wav - |"
    done
  done
) | sort -u > "$galeData/wav.scp"

#clean
#clean
rm -fr $galeData/id$$ $galeData/wav$$
echo data prep audio succeded
echo "$0: data prep audio succeded"

exit 0

33 changes: 15 additions & 18 deletions egs/gale_mandarin/s5/local/gale_data_prep_split.sh
Original file line number Diff line number Diff line change
@@ -1,37 +1,33 @@
#!/bin/bash
#!/bin/bash

# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
# Apache 2.0

if [ $# -ne 1 ]; then
echo "Arguments should be the <gale folder>"; exit 1
fi

set -e -o pipefail
# data will be written to data/local

galeData=$(readlink -f $1)
mkdir -p data/local
dir=$(readlink -f data/local)

cat $galeData/utt2spk | awk '{print$2}' | sort -u > $galeData/spklist

cat $galeData/spklist | utils/shuffle_list.pl --srand ${seed:-777} > $galeData/spklist.shuffled

# we want about 6h dev data; 300 is manually chosen
cat $galeData/spklist.shuffled | head -n 300 > $galeData/spklist.dev


cat $galeData/utt2spk | grep -f $galeData/spklist.dev | awk '{print$1}' > $galeData/dev.list

# some problem with the text data; same utt id but different transcription
cat $galeData/all | awk '{print$2}' | sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list
cat $galeData/all | awk '{print$2}' | \
sort | uniq -c | awk '{if($1!="1")print$2}' > $galeData/dup.list

utils/filter_scp.pl --exclude -f 2 $galeData/dup.list $galeData/all > $galeData/all_nodup
utils/filter_scp.pl --exclude -f 2 \
$galeData/dup.list $galeData/all > $galeData/all.nodup

mv $galeData/all_nodup $galeData/all
mv $galeData/all $galeData/all.orig
mv $galeData/all.nodup $galeData/all

utils/filter_scp.pl -f 2 $galeData/dev.list $galeData/all > $galeData/all.dev
utils/filter_scp.pl --exclude -f 2 $galeData/dev.list $galeData/all > $galeData/all.train
grep -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.dev
grep -v -f <(cat local/test.LDC*) $galeData/all | grep -v -F -f local/bad_utts > $galeData/all.train

cat $galeData/all.dev | awk '{print$2}' > $galeData/dev_utt_list
cat $galeData/all.train | awk '{print$2}' > $galeData/train_utt_list
Expand All @@ -46,11 +42,11 @@ utils/utt2spk_to_spk2utt.pl $dir/train/utt2spk | sort -u > $dir/train/spk2utt

for x in dev train; do
outdir=$dir/$x
file=$galeData/all.$x
file=$galeData/all.$x
mkdir -p $outdir
awk '{print $2 " " $1 " " $3 " " $4}' $file | sort -u > $outdir/segments
awk '{printf $2 " "; for (i=5; i<=NF; i++) {printf $i " "} printf "\n"}' $file | sort -u > $outdir/text
done
done

cat $dir/dev/segments | awk '{print$2}' | sort -u > $galeData/dev.wav.list
cat $dir/train/segments | awk '{print$2}' | sort -u > $galeData/train.wav.list
Expand All @@ -60,5 +56,6 @@ utils/filter_scp.pl -f 1 $galeData/train.wav.list $galeData/wav.scp > $dir/train

cat $galeData/wav.scp | awk -v seg=$dir/train/segments 'BEGIN{while((getline<seg) >0) {seen[$2]=1;}}
{if (seen[$1]) { print $0}}' > $dir/train/wav.scp



echo data prep split succeeded
74 changes: 47 additions & 27 deletions egs/gale_mandarin/s5/local/gale_data_prep_txt.sh
Original file line number Diff line number Diff line change
@@ -1,28 +1,38 @@
#!/bin/bash

# Copyright 2014 (author: Ahmed Ali, Hainan Xu)
# Copyright 2016 Johns Hopkins University (author: Jan "Yenda" Trmal)
# Apache 2.0

if [ $# -ne 2 ]; then
echo "Arguments should be the <gale folder> <txt data folder>"; exit 1
fi

echo $0 "$@"
export LC_ALL=C

galeData=$1
text=$2
galeData=$(readlink -f "${@: -1}" );

cur=`pwd`
length=$(($#-1))
args=${@:1:$length}

txtdir=$galeData/txt
mkdir -p $galeData/txt
top_pwd=`pwd`
txtdir=$galeData/txt
mkdir -p $txtdir

cd $txtdir

find $text -type f -name *.tdf | while read file; do
sed '1,3d' $file
done > all.tmp
# Make every input source available under the current directory:
# .tgz archives are unpacked here, directories are symlinked here.
# NOTE(review): $cdx is expanded unquoted throughout, so a path containing
# whitespace or glob characters would be split/expanded — confirm inputs.
for cdx in ${args[@]}; do
echo "Preparing $cdx"
if [[ $cdx == *.tgz ]] ; then
# Extract the archive into the current directory.
tar -xvf $cdx
elif [ -d "$cdx" ]; then
# Link the directory under its own base name unless that name already
# passes the -x test.
# NOTE(review): the link target is $cdx as given; a relative path would be
# resolved relative to the link's location — confirm callers pass absolute
# paths.
tgt=$(basename $cdx)
test -x $tgt || ln -s $cdx `basename $tgt`
else
# Neither an archive nor a directory: report on stderr and carry on with
# the next input (the loop does not abort here).
echo "I don't really know what I shall do with $cdx " >&2
fi
done

# Concatenate every .tdf transcript found below the current directory
# (following symlinks) into all.tmp, dropping the first three lines of
# each file. The pattern is quoted so the shell cannot expand it against
# .tdf files sitting directly in this directory, and each path is read raw
# and passed quoted so it reaches sed intact.
find -L . -type f -name '*.tdf' | while IFS= read -r file; do
  sed '1,3d' "$file"
done > all.tmp

perl -e '
($inFile,$idFile,$txtFile,$spk,$mapf)= split /\s+/, $ARGV[0];
Expand All @@ -34,34 +44,47 @@ perl -e '
while (<IN>) {
@arr= split /\t/,$_;
$arr[4] =~ s/ //g;
$arr[4] = sprintf("%020s", $arr[4]);
$spkid = "$arr[0]_$arr[4]";
$spkfix = sprintf("%060s", $spkid);
$start=sprintf ("%0.3f",$arr[2]);$rStart=$start;$start=~s/\.//; $start=~s/^0+$/0/; $start=~s/^0+([^0])/$1/; # remove zeros at the beginning
$end=sprintf ("%0.3f",$arr[3]);$rEnd=$end;$end=~s/^0+([^0])/$1/;$end=~s/\.//;
$id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n";
next if ($rStart == $rEnd);
$id =~ s/.sph//g;
print ID $id;
$spkfix = sprintf("%080s", $spkid);

$start=sprintf ("%0.3f",$arr[2]);
$rStart=$start;
$start=~s/\.//;
$start=~s/^0+$/0/;
$start=~s/^0+([^0])/$1/; # remove zeros at the beginning
$start = sprintf("%09s", $start);

$end=sprintf ("%0.3f",$arr[3]);
$rEnd=$end;
$end=~s/^0+([^0])/$1/;
$end=~s/\.//;
$end = sprintf("%09s", $end);

$id="$arr[11] $arr[0] ${spkfix}_$arr[0]_${start}_${end} $rStart $rEnd\n";
next if ($rStart == $rEnd);
$id =~ s/.sph//g;
print ID $id;
print TXT "$arr[7]\n";
print SPK "${spkfix}_$arr[0]_${start}_${end} ${spkfix}\n";
print MAP "$arr[0] ${spkfix}_$arr[0]\n";
}' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp"
}' "all.tmp allid.tmp contentall.tmp utt2spk.tmp map.tmp"

perl -p -i -e 's=/.$==g' contentall.tmp

cd $cur
cd $top_pwd


pyver=`python --version 2>&1 | sed -e 's:.*\([2-3]\.[0-9]\+\).*:\1:g'`
export PYTHONPATH=$PYTHONPATH:`pwd`/tools/mmseg-1.3.0/lib/python${pyver}/site-packages
if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then
echo "--- Downloading mmseg-1.3.0 ..."
echo "NOTE: it assumes that you have Python, Setuptools installed on your system!"
wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz
wget -P tools http://pypi.python.org/packages/source/m/mmseg/mmseg-1.3.0.tar.gz
tar xf tools/mmseg-1.3.0.tar.gz -C tools
cd tools/mmseg-1.3.0
mkdir -p lib/python${pyver}/site-packages
python setup.py build
CC=gcc CXX=g++ python setup.py build
python setup.py install --prefix=.
cd ../..
if [ ! -d tools/mmseg-1.3.0/lib/python${pyver}/site-packages ]; then
Expand Down Expand Up @@ -90,11 +113,8 @@ awk '{$1="";print $0}' $txtdir/all_1.tmp | sed 's:^ ::' > $txtdir/../all
cat $txtdir/utt2spk.tmp | sort -u > $txtdir/../utt2spk
cat $txtdir/map.tmp | sort -u > $txtdir/../map

sort -c $txtdir/../utt2spk
sort -c $txtdir/../utt2spk

utils/utt2spk_to_spk2utt.pl $txtdir/../utt2spk | sort -u > $txtdir/../spk2utt

cd ..;
rm -fr $txtdir

echo data prep text succeeded
Loading