Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion egs/wsj/s5/utils/combine_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ else
echo "$0 [info]: not combining segments as it does not exist"
fi

for file in utt2spk utt2lang utt2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
for file in utt2spk utt2lang utt2dur reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
exists_somewhere=false
absent_somewhere=false
for d in $*; do
Expand Down
7 changes: 7 additions & 0 deletions egs/wsj/s5/utils/copy_data_dir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,13 @@ fi
if [ -f $srcdir/utt2dur ]; then
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur
fi
# Carry reco2dur over to the destination directory, if the source has one.
if [ -f $srcdir/reco2dur ]; then
if [ -f $srcdir/segments ]; then
# A segments file is present: reco2dur is copied over unchanged.
# NOTE(review): presumably because it is then keyed by recording-id, which
# this script does not rename -- confirm.
cp $srcdir/reco2dur $destdir/reco2dur
else
# No segments file: the first field is rewritten through utt_map, in the
# same way as is done for utt2dur.
# NOTE(review): presumably recording-ids coincide with utterance-ids in
# this case -- confirm.
utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur
fi
fi
if [ -f $srcdir/spk2gender ]; then
utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
fi
Expand Down
143 changes: 143 additions & 0 deletions egs/wsj/s5/utils/data/get_reco2dur.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#!/bin/bash

# Copyright 2016 Johns Hopkins University (author: Daniel Povey)
# 2018 Andrea Carmantini
# Apache 2.0

# This script operates on a data directory, such as in data/train/, and adds the
# reco2dur file if it does not already exist. The file 'reco2dur' maps from
# recording-id to the duration of the recording in seconds. If utterance-ids are
# the same as recording-ids (i.e. there is no segments file) and a complete
# utt2dur file is present, the durations are taken from utt2dur. Otherwise this
# script works them out from the 'wav.scp' file: it first tries interrogating the
# sphere-file headers, and if this fails, it falls back to wav-to-duration.
# We could use durations from the segments file, but that would not give the
# duration of the recordings but the sum of utterance lengths (silence in between
# could be excluded from segments).
# For the sum of utterance lengths:
# awk 'FNR==NR{uttdur[$1]=$2;next}
# { for(i=2;i<=NF;i++){dur+=uttdur[$i];}
# print $1 FS dur; dur=0 }' $data/utt2dur $data/reco2utt


# Option defaults.
# NOTE(review): these are presumably overridable on the command line as
# --frame-shift / --cmd / --nj through utils/parse_options.sh -- confirm.
frame_shift=0.01  # not referenced anywhere else in this script; kept so that
                  # existing callers passing --frame-shift keep working.
cmd=run.pl        # job launcher used for the wav-to-duration jobs.
nj=4              # maximum number of parallel wav-to-duration jobs.

. utils/parse_options.sh
. ./path.sh

if [ $# != 1 ]; then
  echo "Usage: $0 [options] <datadir>"
  echo "e.g.:"
  echo " $0 data/train"
  echo " Options:"
  echo " --nj <n>        # number of parallel jobs used when durations have to be"
  echo "                 # obtained with wav-to-duration (default: 4)."
  echo " --cmd <cmd>     # how to run those jobs (default: run.pl)."
  echo " --frame-shift   # frame shift in seconds (default: 0.01). Accepted for"
  echo "                 # compatibility; this script does not use it."
  exit 1
fi

export LC_ALL=C

data=$1

# Nothing to do if reco2dur is already there and has one line per wav.scp
# entry.  wav.scp is checked for existence first so that a missing file does
# not produce a redirection error inside the test.
if [ -s "$data/reco2dur" ] && [ -f "$data/wav.scp" ] && \
   [ "$(wc -l < "$data/wav.scp")" -eq "$(wc -l < "$data/reco2dur")" ]; then
  echo "$0: $data/reco2dur already exists with the expected length. We won't recompute it."
  exit 0;
fi

# Work out $data/reco2dur, trying the cheapest source first:
#   1. utt2dur, when it is complete and there is no segments file;
#   2. the headers of the sphere files named in wav.scp;
#   3. wav-to-duration, run in up to $nj parallel jobs through $cmd.
if [ -s "$data/utt2dur" ] && \
   [ "$(wc -l < "$data/utt2spk")" -eq "$(wc -l < "$data/utt2dur")" ] && \
   [ ! -s "$data/segments" ]; then

  echo "$0: $data/wav.scp indexed by utt-id; copying utt2dur to reco2dur"
  # The destination must be reco2dur (not reco2utt), as the message says.
  cp "$data/utt2dur" "$data/reco2dur" || exit 1
  exit 0;

elif [ -f "$data/wav.scp" ]; then
  echo "$0: obtaining durations from recordings"

  # if the wav.scp contains only lines of the form
  # utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph |
  # then the duration can be read from the sphere header.  The perl script
  # exits with non-zero status on the first line that is not of that form,
  # or whose header cannot be parsed, which sends us to the fallback below.
  if perl -e '
    while (<>) {
      s/\|\s*$/ |/;  # make sure final | is preceded by space.
      @A = split;
      if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
            $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
      $reco = $A[0]; $sphere_file = $A[4];

      if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; }
      $sample_rate = -1; $sample_count = -1;
      for ($n = 0; $n <= 30; $n++) {
        $line = <F>;
        if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; }
        if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; }
        # "last" is the perl loop exit; stop at the end of the header so that
        # no audio data gets scanned.
        if ($line =~ m/end_head/) { last; }
      }
      close(F);
      if ($sample_rate == -1 || $sample_count == -1) {
        die "could not parse sphere header from $sphere_file";
      }
      $duration = $sample_count * 1.0 / $sample_rate;
      print "$reco $duration\n";
    } ' < "$data/wav.scp" > "$data/reco2dur"; then
    echo "$0: successfully obtained recording lengths from sphere-file headers"
  else
    echo "$0: could not get recording lengths from sphere-file headers, using wav-to-duration"
    if ! command -v wav-to-duration >/dev/null; then
      echo "$0: wav-to-duration is not on your path"
      exit 1;
    fi

    read_entire_file=false
    if grep -q 'sox.*speed' "$data/wav.scp"; then
      read_entire_file=true
      echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow."
      echo "... It is much faster if you call get_reco2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or "
      echo "... perturb_data_dir_speed_3way.sh."
    fi

    # Never use more jobs than there are recordings.
    num_recos=$(wc -l < "$data/wav.scp")
    if [ "$nj" -gt "$num_recos" ]; then
      nj=$num_recos
    fi

    # One sub-directory, holding one slice of wav.scp, per job.
    temp_data_dir=$data/wav${nj}split
    wavscps=()
    subdirs=()
    for ((n = 1; n <= nj; n++)); do
      subdirs+=("$temp_data_dir/$n")
      wavscps+=("$temp_data_dir/$n/wav.scp")
    done

    # Try to create all the directories in one call; if that fails, create
    # them one at a time.
    if ! mkdir -p "${subdirs[@]}" >/dev/null 2>&1; then
      for d in "${subdirs[@]}"; do
        mkdir -p "$d"
      done
    fi

    utils/split_scp.pl "$data/wav.scp" "${wavscps[@]}"

    # $cmd is deliberately left unquoted: it may carry its own options.
    $cmd JOB=1:$nj "$data/log/get_reco_durations.JOB.log" \
      wav-to-duration --read-entire-file=$read_entire_file \
      "scp:$temp_data_dir/JOB/wav.scp" "ark,t:$temp_data_dir/JOB/reco2dur" || \
      { echo "$0: there was a problem getting the durations"; exit 1; }

    for ((n = 1; n <= nj; n++)); do
      cat "$temp_data_dir/$n/reco2dur"
    done > "$data/reco2dur"

    # Clean up here, inside the fallback branch: the split directory only
    # exists (and temp_data_dir is only set) when wav-to-duration was used.
    rm -r "$temp_data_dir"
  fi
else
  echo "$0: Expected $data/wav.scp to exist"
  exit 1
fi

# Sanity check: reco2dur should have one line per wav.scp entry.  A mismatch
# is only a warning, unless fewer than half of the recordings got a duration,
# in which case the script fails.
len1=$(wc -l < "$data/wav.scp")
len2=$(wc -l < "$data/reco2dur")
if [ "$len1" != "$len2" ]; then
  echo "$0: warning: length of reco2dur does not equal that of wav.scp, $len2 != $len1"
  # $(( )) replaces the deprecated $[ ] arithmetic form.
  if [ "$len1" -gt $((len2 * 2)) ]; then
    echo "$0: less than half of recordings got a duration: failing."
    exit 1
  fi
fi

echo "$0: computed $data/reco2dur"

exit 0
10 changes: 5 additions & 5 deletions egs/wsj/s5/utils/data/get_utt2dur.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@ export LC_ALL=C
data=$1

if [ -s $data/utt2dur ] && \
[ $(cat $data/utt2spk | wc -l) -eq $(cat $data/utt2dur | wc -l) ]; then
[ $(wc -l < $data/utt2spk) -eq $(wc -l < $data/utt2dur) ]; then
echo "$0: $data/utt2dur already exists with the expected length. We won't recompute it."
exit 0;
fi

if [ -s $data/segments ]; then
echo "$0: working out $data/utt2dur from $data/segments"
cat $data/segments | awk '{len=$4-$3; print $1, len;}' > $data/utt2dur
awk '{len=$4-$3; print $1, len;}' < $data/segments > $data/utt2dur
elif [ -f $data/wav.scp ]; then
echo "$0: segments file does not exist so getting durations from wave files"

Expand Down Expand Up @@ -75,7 +75,7 @@ elif [ -f $data/wav.scp ]; then
fi

read_entire_file=false
if cat $data/wav.scp | grep -q 'sox.*speed'; then
if grep -q 'sox.*speed' $data/wav.scp; then
read_entire_file=true
echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow."
echo "... It is much faster if you call get_utt2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or "
Expand Down Expand Up @@ -108,8 +108,8 @@ else
exit 1
fi

len1=$(cat $data/utt2spk | wc -l)
len2=$(cat $data/utt2dur | wc -l)
len1=$(wc -l < $data/utt2spk)
len2=$(wc -l < $data/utt2dur)
if [ "$len1" != "$len2" ]; then
echo "$0: warning: length of utt2dur does not equal that of utt2spk, $len2 != $len1"
if [ $len1 -gt $[$len2*2] ]; then
Expand Down
7 changes: 4 additions & 3 deletions egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@ if [ -f $destdir/feats.scp ]; then
exit 1
fi

echo "$0: making sure the utt2dur file is present in ${srcdir}, because "
echo "... obtaining it after speed-perturbing would be very slow, and"
echo "... you might need it."
echo "$0: making sure the utt2dur and the reco2dur files are present"
echo "... in ${srcdir}, because obtaining it after speed-perturbing"
echo "... would be very slow, and you might need them."
utils/data/get_utt2dur.sh ${srcdir}
utils/data/get_reco2dur.sh ${srcdir}

utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1
utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1
Expand Down
10 changes: 6 additions & 4 deletions egs/wsj/s5/utils/fix_data_dir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ function check_sorted {
}

for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur utt2num_frames; do
reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
check_sorted $data/$x
Expand Down Expand Up @@ -97,7 +97,7 @@ function filter_recordings {

filter_file $tmpdir/recordings $data/wav.scp
[ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel
true
[ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur
fi
}

Expand Down Expand Up @@ -143,7 +143,9 @@ function filter_utts {
fi

maybe_wav=
[ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
maybe_reco2dur=
[ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
[ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts
for x in feats.scp text segments utt2lang $maybe_wav; do
if [ -f $data/$x ]; then
utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
Expand All @@ -164,7 +166,7 @@ function filter_utts {
fi
fi

for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $utt_extra_files; do
for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do
if [ -f $data/$x ]; then
cp $data/$x $data/.backup/$x
if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then
Expand Down
23 changes: 23 additions & 0 deletions egs/wsj/s5/utils/validate_data_dir.sh
Original file line number Diff line number Diff line change
Expand Up @@ -337,4 +337,27 @@ if [ -f $data/utt2dur ]; then
fi


# Validate the optional reco2dur file: it must be sorted and unique, its first
# column must match the set of recording-ids, and every line must consist of
# exactly two fields, the second being a positive number.
if [ -f "$data/reco2dur" ]; then
  check_sorted_and_uniq "$data/reco2dur"
  awk '{print $1}' < "$data/reco2dur" > "$tmpdir/recordings.reco2dur"
  # This test has to use $tmpdir, like every other path in this block; the
  # previous spelling $tempdir made the test always false, so the comparison
  # against the recording list could never be reached.
  # NOTE(review): $tmpdir/recordings is presumably written earlier in this
  # script when a segments file exists -- confirm.
  if [ -f "$tmpdir/recordings" ]; then
    if ! cmp -s "$tmpdir"/recordings{,.reco2dur}; then
      echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
      echo "$0: differ, partial diff is:"
      partial_diff "$tmpdir"/recordings{,.reco2dur}
      exit 1;
    fi
  else
    # No recording list: compare the ids against the utterance list instead.
    if ! cmp -s "$tmpdir"/{utts,recordings.reco2dur}; then
      echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
      echo "$0: differ, partial diff is:"
      partial_diff "$tmpdir"/{utts,recordings.reco2dur}
      exit 1;
    fi
  fi
  # Every line must be "<recording-id> <positive duration>".
  awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' \
    < "$data/reco2dur" || exit 1
fi


echo "$0: Successfully validated data-directory $data"