Add 'reco2dur' (recording-id -> duration in seconds) support to the data-dir utilities.

  * utils/data/get_reco2dur.sh (new): writes <data>/reco2dur, taken from utt2dur when
    there is no segments file, else from sphere headers, else from wav-to-duration.
  * combine_data.sh, copy_data_dir.sh, fix_data_dir.sh, validate_data_dir.sh and
    perturb_data_dir_speed_3way.sh: carry, filter, check or pre-compute reco2dur.
  * get_utt2dur.sh: replace 'cat file | cmd' pipelines with direct redirection.

Note: the post-image blob ids on the 'index' lines of get_reco2dur.sh, fix_data_dir.sh
and validate_data_dir.sh were not recomputed for the content below.

diff --git a/egs/wsj/s5/utils/combine_data.sh b/egs/wsj/s5/utils/combine_data.sh
index 1dc3da6b742..a43cf9d77f3 100755
--- a/egs/wsj/s5/utils/combine_data.sh
+++ b/egs/wsj/s5/utils/combine_data.sh
@@ -94,7 +94,7 @@ else
   echo "$0 [info]: not combining segments as it does not exist"
 fi
 
-for file in utt2spk utt2lang utt2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
+for file in utt2spk utt2lang utt2dur reco2dur feats.scp text cmvn.scp vad.scp reco2file_and_channel wav.scp spk2gender $extra_files; do
   exists_somewhere=false
   absent_somewhere=false
   for d in $*; do
diff --git a/egs/wsj/s5/utils/copy_data_dir.sh b/egs/wsj/s5/utils/copy_data_dir.sh
index 5b0b3946d25..f3b885c5e79 100755
--- a/egs/wsj/s5/utils/copy_data_dir.sh
+++ b/egs/wsj/s5/utils/copy_data_dir.sh
@@ -103,6 +103,13 @@ fi
 if [ -f $srcdir/utt2dur ]; then
   utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/utt2dur >$destdir/utt2dur
 fi
+if [ -f $srcdir/reco2dur ]; then
+  if [ -f $srcdir/segments ]; then
+    cp $srcdir/reco2dur $destdir/reco2dur
+  else
+    utils/apply_map.pl -f 1 $destdir/utt_map <$srcdir/reco2dur >$destdir/reco2dur
+  fi
+fi
 if [ -f $srcdir/spk2gender ]; then
   utils/apply_map.pl -f 1 $destdir/spk_map <$srcdir/spk2gender >$destdir/spk2gender
 fi
diff --git a/egs/wsj/s5/utils/data/get_reco2dur.sh b/egs/wsj/s5/utils/data/get_reco2dur.sh
new file mode 100755
index 00000000000..4b45c6f00a3
--- /dev/null
+++ b/egs/wsj/s5/utils/data/get_reco2dur.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+
+# Copyright 2016  Johns Hopkins University (author: Daniel Povey)
+#           2018  Andrea Carmantini
+# Apache 2.0
+
+# This script operates on a data directory, such as in data/train/, and adds the
+# reco2dur file if it does not already exist.  The file 'reco2dur' maps from
+# recording to the duration of the recording in seconds.  This script works it
+# out from the 'wav.scp' file, or, if utterance-ids are the same as recording-ids, from the
+# utt2dur file (it first tries interrogating the headers, and if this fails, it reads the wave
+# files in entirely.)
+# We could use durations from segments file, but that's not the duration of the recordings
+# but the sum of utterance lengths (silence in between could be excluded from segments)
+# For sum of utterance lengths:
+# awk 'FNR==NR{uttdur[$1]=$2;next}
+# { for(i=2;i<=NF;i++){dur+=uttdur[$i];}
+# print $1 FS dur; dur=0 }' $data/utt2dur $data/reco2utt
+
+
+# NOTE: frame_shift is not referenced anywhere below; it is only declared here.
+frame_shift=0.01
+cmd=run.pl
+nj=4
+
+. utils/parse_options.sh
+. ./path.sh
+
+if [ $# != 1 ]; then
+  echo "Usage: $0 [options] <datadir>"
+  echo "e.g.:"
+  echo " $0 data/train"
+  echo " Options:"
+  echo "  --frame-shift      # frame shift in seconds. Only relevant when we are"
+  echo "                     # getting duration from feats.scp (default: 0.01). "
+  echo "  --nj <nj>          # number of parallel wav-to-duration jobs (default: 4)."
+  echo "  --cmd <cmd>        # how to run those jobs (default: run.pl)."
+  exit 1
+fi
+
+export LC_ALL=C
+
+data=$1
+
+
+if [ -s $data/reco2dur ] && \
+  [ $(wc -l < $data/wav.scp) -eq $(wc -l < $data/reco2dur) ]; then
+  echo "$0: $data/reco2dur already exists with the expected length. We won't recompute it."
+  exit 0;
+fi
+
+if [ -s $data/utt2dur ] && \
+   [ $(wc -l < $data/utt2spk) -eq $(wc -l < $data/utt2dur) ] && \
+   [ ! -s $data/segments ]; then
+
+  echo "$0: $data/wav.scp indexed by utt-id; copying utt2dur to reco2dur"
+  cp $data/utt2dur $data/reco2dur && exit 0;
+
+elif [ -f $data/wav.scp ]; then
+  echo "$0: obtaining durations from recordings"
+
+  # if the wav.scp contains only lines of the form
+  # utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph |
+  if cat $data/wav.scp | perl -e '
+     while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space.
+             @A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
+                               $A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
+             $reco = $A[0]; $sphere_file = $A[4];
+
+             if (!open(F, "<$sphere_file")) { die "Error opening sphere file $sphere_file"; }
+             $sample_rate = -1; $sample_count = -1;
+             for ($n = 0; $n <= 30; $n++) {
+                $line = <F>;
+                if ($line =~ m/sample_rate -i (\d+)/) { $sample_rate = $1; }
+                if ($line =~ m/sample_count -i (\d+)/) { $sample_count = $1; }
+                if ($line =~ m/end_head/) { last; }
+             }
+             close(F);
+             if ($sample_rate == -1 || $sample_count == -1) {
+               die "could not parse sphere header from $sphere_file";
+             }
+             $duration = $sample_count * 1.0 / $sample_rate;
+             print "$reco $duration\n";
+     } ' > $data/reco2dur; then
+    echo "$0: successfully obtained recording lengths from sphere-file headers"
+  else
+    echo "$0: could not get recording lengths from sphere-file headers, using wav-to-duration"
+    if ! command -v wav-to-duration >/dev/null; then
+      echo "$0: wav-to-duration is not on your path"
+      exit 1;
+    fi
+
+    read_entire_file=false
+    if grep -q 'sox.*speed' $data/wav.scp; then
+      read_entire_file=true
+      echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow."
+      echo "... It is much faster if you call get_reco2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or "
+      echo "... perturb_data_dir_speed_3way.sh."
+    fi
+
+    num_recos=$(wc -l <$data/wav.scp)
+    if [ $nj -gt $num_recos ]; then
+      nj=$num_recos
+    fi
+
+    temp_data_dir=$data/wav${nj}split
+    wavscps=$(for n in `seq $nj`; do echo $temp_data_dir/$n/wav.scp; done)
+    subdirs=$(for n in `seq $nj`; do echo $temp_data_dir/$n; done)
+
+    if ! mkdir -p $subdirs >&/dev/null; then
+      for n in `seq $nj`; do
+        mkdir -p $temp_data_dir/$n
+      done
+    fi
+
+    utils/split_scp.pl $data/wav.scp $wavscps
+
+
+    $cmd JOB=1:$nj $data/log/get_reco_durations.JOB.log \
+      wav-to-duration --read-entire-file=$read_entire_file \
+      scp:$temp_data_dir/JOB/wav.scp ark,t:$temp_data_dir/JOB/reco2dur || \
+        { echo "$0: there was a problem getting the durations"; exit 1; }
+
+    for n in `seq $nj`; do
+      cat $temp_data_dir/$n/reco2dur
+    done > $data/reco2dur
+    # The split directory is only created in this branch, so only remove it here.
+    rm -r $temp_data_dir
+  fi
+else
+  echo "$0: Expected $data/wav.scp to exist"
+  exit 1
+fi
+
+len1=$(wc -l < $data/wav.scp)
+len2=$(wc -l < $data/reco2dur)
+if [ "$len1" != "$len2" ]; then
+  echo "$0: warning: length of reco2dur does not equal that of wav.scp, $len2 != $len1"
+  if [ $len1 -gt $(($len2*2)) ]; then
+    echo "$0: less than half of recordings got a duration: failing."
+    exit 1
+  fi
+fi
+
+echo "$0: computed $data/reco2dur"
+
+exit 0
diff --git a/egs/wsj/s5/utils/data/get_utt2dur.sh b/egs/wsj/s5/utils/data/get_utt2dur.sh
index 800cac81d18..995136a5575 100755
--- a/egs/wsj/s5/utils/data/get_utt2dur.sh
+++ b/egs/wsj/s5/utils/data/get_utt2dur.sh
@@ -32,14 +32,14 @@ export LC_ALL=C
 data=$1
 
 if [ -s $data/utt2dur ] && \
-  [ $(cat $data/utt2spk | wc -l) -eq $(cat $data/utt2dur | wc -l) ]; then
+  [ $(wc -l < $data/utt2spk) -eq $(wc -l < $data/utt2dur) ]; then
   echo "$0: $data/utt2dur already exists with the expected length. We won't recompute it."
   exit 0;
 fi
 
 if [ -s $data/segments ]; then
   echo "$0: working out $data/utt2dur from $data/segments"
-  cat $data/segments | awk '{len=$4-$3; print $1, len;}' > $data/utt2dur
+  awk '{len=$4-$3; print $1, len;}' < $data/segments > $data/utt2dur
 elif [ -f $data/wav.scp ]; then
   echo "$0: segments file does not exist so getting durations from wave files"
 
@@ -75,7 +75,7 @@ elif [ -f $data/wav.scp ]; then
     fi
 
     read_entire_file=false
-    if cat $data/wav.scp | grep -q 'sox.*speed'; then
+    if grep -q 'sox.*speed' $data/wav.scp; then
       read_entire_file=true
       echo "$0: reading from the entire wav file to fix the problem caused by sox commands with speed perturbation. It is going to be slow."
       echo "... It is much faster if you call get_utt2dur.sh *before* doing the speed perturbation via e.g. perturb_data_dir_speed.sh or "
@@ -108,8 +108,8 @@ else
   exit 1
 fi
 
-len1=$(cat $data/utt2spk | wc -l)
-len2=$(cat $data/utt2dur | wc -l)
+len1=$(wc -l < $data/utt2spk)
+len2=$(wc -l < $data/utt2dur)
 if [ "$len1" != "$len2" ]; then
   echo "$0: warning: length of utt2dur does not equal that of utt2spk, $len2 != $len1"
   if [ $len1 -gt $[$len2*2] ]; then
diff --git a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh
index f857ae2bdd7..cd291427398 100755
--- a/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh
+++ b/egs/wsj/s5/utils/data/perturb_data_dir_speed_3way.sh
@@ -47,10 +47,11 @@ if [ -f $destdir/feats.scp ]; then
   exit 1
 fi
 
-echo "$0: making sure the utt2dur file is present in ${srcdir}, because "
-echo "... obtaining it after speed-perturbing would be very slow, and"
-echo "... you might need it."
+echo "$0: making sure the utt2dur and the reco2dur files are present"
+echo "... in ${srcdir}, because obtaining it after speed-perturbing"
+echo "... would be very slow, and you might need them."
 utils/data/get_utt2dur.sh ${srcdir}
+utils/data/get_reco2dur.sh ${srcdir}
 
 utils/data/perturb_data_dir_speed.sh 0.9 ${srcdir} ${destdir}_speed0.9 || exit 1
 utils/data/perturb_data_dir_speed.sh 1.1 ${srcdir} ${destdir}_speed1.1 || exit 1
diff --git a/egs/wsj/s5/utils/fix_data_dir.sh b/egs/wsj/s5/utils/fix_data_dir.sh
index 8ebfc8d49fe..10bcb79299a 100755
--- a/egs/wsj/s5/utils/fix_data_dir.sh
+++ b/egs/wsj/s5/utils/fix_data_dir.sh
@@ -46,7 +46,7 @@ function check_sorted {
 }
 
 for x in utt2spk spk2utt feats.scp text segments wav.scp cmvn.scp vad.scp \
-    reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur utt2num_frames; do
+    reco2file_and_channel spk2gender utt2lang utt2uniq utt2dur reco2dur utt2num_frames; do
   if [ -f $data/$x ]; then
     cp $data/$x $data/.backup/$x
     check_sorted $data/$x
@@ -97,7 +97,8 @@ function filter_recordings {
 
     filter_file $tmpdir/recordings $data/wav.scp
     [ -f $data/reco2file_and_channel ] && filter_file $tmpdir/recordings $data/reco2file_and_channel
+    [ -f $data/reco2dur ] && filter_file $tmpdir/recordings $data/reco2dur
     true
   fi
 }
 
@@ -143,7 +144,9 @@ function filter_utts {
   fi
 
   maybe_wav=
-  [ ! -f $data/segments ] && maybe_wav=wav.scp  # wav indexed by utts only if segments does not exist.
+  maybe_reco2dur=
+  [ ! -f $data/segments ] && maybe_wav=wav.scp # wav indexed by utts only if segments does not exist.
+  [ -s $data/reco2dur ] && [ ! -f $data/segments ] && maybe_reco2dur=reco2dur # reco2dur indexed by utts
   for x in feats.scp text segments utt2lang $maybe_wav; do
     if [ -f $data/$x ]; then
       utils/filter_scp.pl $data/$x $tmpdir/utts > $tmpdir/utts.tmp
@@ -164,7 +167,7 @@ function filter_utts {
     fi
   fi
 
-  for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $utt_extra_files; do
+  for x in utt2spk utt2uniq feats.scp vad.scp text segments utt2lang utt2dur utt2num_frames $maybe_wav $maybe_reco2dur $utt_extra_files; do
     if [ -f $data/$x ]; then
       cp $data/$x $data/.backup/$x
       if ! cmp -s $data/$x <( utils/filter_scp.pl $tmpdir/utts $data/$x ) ; then
diff --git a/egs/wsj/s5/utils/validate_data_dir.sh b/egs/wsj/s5/utils/validate_data_dir.sh
index dbbaeb10d5d..263e0218bc0 100755
--- a/egs/wsj/s5/utils/validate_data_dir.sh
+++ b/egs/wsj/s5/utils/validate_data_dir.sh
@@ -337,4 +337,27 @@ if [ -f $data/utt2dur ]; then
 fi
 
 
+if [ -f $data/reco2dur ]; then
+  check_sorted_and_uniq $data/reco2dur
+  cat $data/reco2dur | awk '{print $1}' > $tmpdir/recordings.reco2dur
+  if [ -f $tmpdir/recordings ]; then
+    if ! cmp -s $tmpdir/recordings{,.reco2dur}; then
+      echo "$0: Error: in $data, recording-ids extracted from segments and reco2dur file"
+      echo "$0: differ, partial diff is:"
+      partial_diff $tmpdir/recordings{,.reco2dur}
+      exit 1;
+    fi
+  else
+    if ! cmp -s $tmpdir/{utts,recordings.reco2dur}; then
+      echo "$0: Error: in $data, recording-ids extracted from wav.scp and reco2dur file"
+      echo "$0: differ, partial diff is:"
+      partial_diff $tmpdir/{utts,recordings.reco2dur}
+      exit 1;
+    fi
+  fi
+  cat $data/reco2dur | \
+    awk '{ if (NF != 2 || !($2 > 0)) { print "Bad line : " $0; exit(1) }}' || exit 1
+fi
+
+
 echo "$0: Successfully validated data-directory $data"