Skip to content

Commit c69d6c3

Browse files
author
kkm
committed
[scripts] Trust frame_shift and utt2num_frames if found
Getting utt2dur involves accessing wave files, and potentially running the full pipelines in wav.scp, which may take hours for a large data set. If utt2num_frames exists and the frame shift is known (i.e. the frame_shift file is present), compute utt2dur from them instead. Issue: kaldi-asr#3303. Fixes: kaldi-asr#3297 ("cat: broken pipe").
1 parent ba165c8 commit c69d6c3

File tree

2 files changed

+36
-27
lines changed

2 files changed

+36
-27
lines changed

egs/wsj/s5/utils/data/get_frame_shift.sh

+23-24
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ if [ $# != 1 ]; then
1818
echo "e.g.:"
1919
echo " $0 data/train"
2020
echo "This script prints the frame-shift (e.g. 0.01) to the standard out."
21-
echo "If <datadir> does not contain utt2dur, this script may call utils/data/get_utt2dur.sh,"
21+
echo "If <datadir> does not contain frame_shift or utt2dur, this script may call utils/data/get_utt2dur.sh,"
2222
echo "which will require write permission to <datadir>"
2323
exit 1
2424
fi
@@ -27,6 +27,15 @@ export LC_ALL=C
2727

2828
dir=$1
2929

30+
if [[ -s $dir/frame_shift ]]; then
31+
cat $dir/frame_shift
32+
exit
33+
fi
34+
35+
if [ ! -f $dir/feats.scp ]; then
36+
echo "$0: $dir/feats.scp does not exist" 1>&2
37+
exit 1
38+
fi
3039

3140
if [ ! -s $dir/utt2dur ]; then
3241
if [ ! -e $dir/wav.scp ] && [ ! -s $dir/segments ]; then
@@ -38,34 +47,24 @@ if [ ! -s $dir/utt2dur ]; then
3847
utils/data/get_utt2dur.sh $dir 1>&2
3948
fi
4049

41-
if [ ! -s $dir/frame_shift ]; then
42-
if [ ! -f $dir/feats.scp ]; then
43-
echo "$0: $dir/feats.scp does not exist" 1>&2
44-
exit 1
45-
fi
46-
47-
temp=$(mktemp /tmp/tmp.XXXX)
48-
49-
feat-to-len "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp
50+
temp=$(mktemp /tmp/tmp.XXXX)
5051

51-
if [ -z $temp ]; then
52-
echo "$0: error running feat-to-len" 1>&2
53-
exit 1
54-
fi
55-
56-
frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | \
57-
awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }') || exit 1;
52+
feat-to-len "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp
5853

59-
echo $frame_shift > $dir/frame_shift
54+
if [[ ! -s $temp ]]; then
6055
rm $temp
61-
fi
62-
63-
frame_shift=$(cat $dir/frame_shift)
64-
if [ -z "$frame_shift" ]; then
65-
echo "$0: Could not read get frame shift from directory $dir" 1>&2
56+
echo "$0: error running feat-to-len" 1>&2
6657
exit 1
6758
fi
6859

69-
echo $frame_shift
60+
frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | awk '
61+
{ dur += $2; frames += $4; }
62+
END { shift = dur / frames;
63+
if (shift > 0.01 && shift < 0.0102) shift = 0.01;
64+
print shift; }') || exit 1;
65+
66+
rm $temp
7067

68+
echo $frame_shift > $dir/frame_shift
69+
echo $frame_shift
7170
exit 0

egs/wsj/s5/utils/data/get_utt2dur.sh

+13-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ if [ $# != 1 ]; then
2323
echo " $0 data/train"
2424
echo " Options:"
2525
echo " --frame-shift # frame shift in seconds. Only relevant when we are"
26-
echo " # getting duration from feats.scp (default: 0.01). "
26+
echo " # getting duration from feats.scp, and only if the "
27+
echo " # file frame_shift does not exist (default: 0.01). "
2728
exit 1
2829
fi
2930

@@ -40,12 +41,16 @@ fi
4041
if [ -s $data/segments ]; then
4142
echo "$0: working out $data/utt2dur from $data/segments"
4243
awk '{len=$4-$3; print $1, len;}' < $data/segments > $data/utt2dur
44+
elif [[ -s $data/frame_shift && -f $data/utt2num_frames ]]; then
45+
echo "$0: computing $data/utt2dur from $data/{frame_shift,utt2num_frames}."
46+
frame_shift=$(cat $data/frame_shift) || exit 1
47+
awk -v fs=$frame_shift '{ $2 *= fs; print }' <$data/utt2num_frames >$data/utt2dur
4348
elif [ -f $data/wav.scp ]; then
4449
echo "$0: segments file does not exist so getting durations from wave files"
4550

4651
# if the wav.scp contains only lines of the form
4752
# utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph |
48-
if cat $data/wav.scp | perl -e '
53+
if perl <$data/wav.scp -e '
4954
while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space.
5055
@A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
5156
$A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
@@ -102,7 +107,12 @@ elif [ -f $data/wav.scp ]; then
102107
fi
103108
elif [ -f $data/feats.scp ]; then
104109
echo "$0: wave file does not exist so getting durations from feats files"
105-
feat-to-len scp:$data/feats.scp ark,t:- | awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' >$data/utt2dur
110+
if [[ -s $data/frame_shift ]]; then
111+
frame_shift=$(cat $data/frame_shift) || exit 1
112+
echo "$0: using frame_shift=$frame_shift from file $data/frame_shift"
113+
fi
114+
feat-to-len scp:$data/feats.scp ark,t:- |
115+
awk -v frame_shift=$frame_shift '{print $1, $2 * frame_shift}' >$data/utt2dur
106116
else
107117
echo "$0: Expected $data/wav.scp, $data/segments or $data/feats.scp to exist"
108118
exit 1

0 commit comments

Comments
 (0)