Skip to content

Commit 94aef8d

Browse files
kkm (aka Kirill Katsnelson) authored and danpovey committed
[scripts] Trust frame_shift and utt2num_frames if found (kaldi-asr#3313)
Getting utt2dur involves accessing wave files, and potentially running full pipelines in wav.scp, which may take hours for a large data set. If utt2num_frames exists and the frame shift is known, use it instead. Issue: kaldi-asr#3303. Fixes: kaldi-asr#3297 ("cat: broken pipe").
1 parent 9569384 commit 94aef8d

File tree

2 files changed

+48
-33
lines changed

2 files changed

+48
-33
lines changed

egs/wsj/s5/utils/data/get_frame_shift.sh

+33-30
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,32 @@
1414
. ./path.sh
1515

1616
if [ $# != 1 ]; then
17-
echo "Usage: $0 <datadir>"
18-
echo "e.g.:"
19-
echo " $0 data/train"
20-
echo "This script prints the frame-shift (e.g. 0.01) to the standard out."
21-
echo "If <datadir> does not contain utt2dur, this script may call utils/data/get_utt2dur.sh,"
22-
echo "which will require write permission to <datadir>"
17+
cat >&2 <<EOF
18+
Usage: frame_shift=\$($0 <datadir>)
19+
e.g.: frame_shift=\$($0 data/train)
20+
21+
This script prints the frame-shift in seconds (e.g. 0.01) to the standard out.
22+
Its output is intended to be captured in a shell variable.
23+
24+
If <datadir> does not contain the file utt2dur, this script may invoke
25+
utils/data/get_utt2dur.sh, which will require write permission to <datadir>.
26+
EOF
2327
exit 1
2428
fi
2529

2630
export LC_ALL=C
2731

2832
dir=$1
2933

34+
if [[ -s $dir/frame_shift ]]; then
35+
cat $dir/frame_shift
36+
exit
37+
fi
38+
39+
if [ ! -f $dir/feats.scp ]; then
40+
echo "$0: $dir/feats.scp does not exist" 1>&2
41+
exit 1
42+
fi
3043

3144
if [ ! -s $dir/utt2dur ]; then
3245
if [ ! -e $dir/wav.scp ] && [ ! -s $dir/segments ]; then
@@ -35,37 +48,27 @@ if [ ! -s $dir/utt2dur ]; then
3548
exit 0
3649
fi
3750
echo "$0: $dir/utt2dur does not exist: creating it" 1>&2
38-
utils/data/get_utt2dur.sh $dir 1>&2
51+
utils/data/get_utt2dur.sh 1>&2 $dir || exit 1
3952
fi
4053

41-
if [ ! -s $dir/frame_shift ]; then
42-
if [ ! -f $dir/feats.scp ]; then
43-
echo "$0: $dir/feats.scp does not exist" 1>&2
44-
exit 1
45-
fi
46-
47-
temp=$(mktemp /tmp/tmp.XXXX)
54+
temp=$(mktemp /tmp/tmp.XXXX) || exit 1
4855

49-
feat-to-len "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp
56+
feat-to-len --print-args=false "scp:head -n 10 $dir/feats.scp|" ark,t:- > $temp
5057

51-
if [ -z $temp ]; then
52-
echo "$0: error running feat-to-len" 1>&2
53-
exit 1
54-
fi
55-
56-
frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | \
57-
awk '{ dur += $2; frames += $4; } END { shift = dur / frames; if (shift > 0.01 && shift < 0.0102) shift = 0.01; print shift; }') || exit 1;
58-
59-
echo $frame_shift > $dir/frame_shift
58+
if [[ ! -s $temp ]]; then
6059
rm $temp
61-
fi
62-
63-
frame_shift=$(cat $dir/frame_shift)
64-
if [ -z "$frame_shift" ]; then
65-
echo "$0: Could not read get frame shift from directory $dir" 1>&2
60+
echo "$0: error running feat-to-len" 1>&2
6661
exit 1
6762
fi
6863

69-
echo $frame_shift
64+
frame_shift=$(head -n 10 $dir/utt2dur | paste - $temp | awk '
65+
{ dur += $2; frames += $4; }
66+
END { shift = dur / frames;
67+
if (shift > 0.01 && shift < 0.0102) shift = 0.01;
68+
print shift; }') || exit 1;
7069

70+
rm $temp
71+
72+
echo $frame_shift > $dir/frame_shift
73+
echo $frame_shift
7174
exit 0

egs/wsj/s5/utils/data/get_utt2dur.sh

+15-3
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ if [ $# != 1 ]; then
2323
echo " $0 data/train"
2424
echo " Options:"
2525
echo " --frame-shift # frame shift in seconds. Only relevant when we are"
26-
echo " # getting duration from feats.scp (default: 0.01). "
26+
echo " # getting duration from feats.scp, and only if the "
27+
echo " # file frame_shift does not exist (default: 0.01). "
2728
exit 1
2829
fi
2930

@@ -40,12 +41,17 @@ fi
4041
if [ -s $data/segments ]; then
4142
echo "$0: working out $data/utt2dur from $data/segments"
4243
awk '{len=$4-$3; print $1, len;}' < $data/segments > $data/utt2dur
44+
elif [[ -s $data/frame_shift && -f $data/utt2num_frames ]]; then
45+
echo "$0: computing $data/utt2dur from $data/{frame_shift,utt2num_frames}."
46+
frame_shift=$(cat $data/frame_shift) || exit 1
47+
# The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift.
48+
awk -v fs=$frame_shift '{ $2=($2+1.5)*fs; print }' <$data/utt2num_frames >$data/utt2dur
4349
elif [ -f $data/wav.scp ]; then
4450
echo "$0: segments file does not exist so getting durations from wave files"
4551

4652
# if the wav.scp contains only lines of the form
4753
# utt1 /foo/bar/sph2pipe -f wav /baz/foo.sph |
48-
if cat $data/wav.scp | perl -e '
54+
if perl <$data/wav.scp -e '
4955
while (<>) { s/\|\s*$/ |/; # make sure final | is preceded by space.
5056
@A = split; if (!($#A == 5 && $A[1] =~ m/sph2pipe$/ &&
5157
$A[2] eq "-f" && $A[3] eq "wav" && $A[5] eq "|")) { exit(1); }
@@ -102,7 +108,13 @@ elif [ -f $data/wav.scp ]; then
102108
fi
103109
elif [ -f $data/feats.scp ]; then
104110
echo "$0: wave file does not exist so getting durations from feats files"
105-
feat-to-len scp:$data/feats.scp ark,t:- | awk -v frame_shift=$frame_shift '{print $1, $2*frame_shift;}' >$data/utt2dur
111+
if [[ -s $data/frame_shift ]]; then
112+
frame_shift=$(cat $data/frame_shift) || exit 1
113+
echo "$0: using frame_shift=$frame_shift from file $data/frame_shift"
114+
fi
115+
# The 1.5 correction is the typical value of (frame_length-frame_shift)/frame_shift.
116+
feat-to-len scp:$data/feats.scp ark,t:- |
117+
awk -v frame_shift=$frame_shift '{print $1, ($2+1.5)*frame_shift}' >$data/utt2dur
106118
else
107119
echo "$0: Expected $data/wav.scp, $data/segments or $data/feats.scp to exist"
108120
exit 1

0 commit comments

Comments
 (0)