Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 33 additions & 32 deletions egs/wsj/s5/steps/nnet3/chain/get_egs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -151,36 +151,40 @@ mkdir -p $dir/log $dir/info
# Get list of validation utterances.
frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1

if [ -f $data/utt2uniq ]; then
# Must hold out all augmented versions of the same utterance.
echo "$0: File $data/utt2uniq exists, so ensuring the hold-out set" \
"includes all perturbed versions of the same source utterance."
utils/utt2spk_to_spk2utt.pl $data/utt2uniq 2>/dev/null |
awk -v max_utt=$num_utts_subset '{
for (n=2;n<=NF;n++) print $n;
printed += NF-1;
if (printed >= max_utt) nextfile; }' |
sort > $dir/valid_uttlist
else
awk '{print $1}' $data/utt2spk | \
utils/shuffle_list.pl 2>/dev/null | \
head -$num_utts_subset > $dir/valid_uttlist
fi
len_valid_uttlist=$(wc -l < $dir/valid_uttlist)

awk '{print $1}' $data/utt2spk | \
utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist
utils/filter_scp.pl --exclude $dir/valid_uttlist | \
utils/shuffle_list.pl 2>/dev/null | \
head -$num_utts_subset > $dir/train_subset_uttlist
len_trainsub_uttlist=$(wc -l <$dir/train_subset_uttlist)

len_uttlist=$(wc -l < $dir/valid_uttlist)
if [ $len_uttlist -lt $num_utts_subset ]; then
echo "Number of utterances is very small. Please check your data." && exit 1;
if [[ $len_valid_uttlist -lt $num_utts_subset ||
$len_trainsub_uttlist -lt $num_utts_subset ]]; then
echo "$0: Number of utterances is very small. Please check your data." && exit 1;
fi

if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation.
# because of this stage we can again have utts with lengths less than
# frames_per_eg
echo "File $data/utt2uniq exists, so augmenting valid_uttlist to"
echo "include all perturbed versions of the same 'real' utterances."
mv $dir/valid_uttlist $dir/valid_uttlist.tmp
utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt
cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \
sort | uniq | utils/apply_map.pl $dir/uniq2utt | \
awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist
rm $dir/uniq2utt $dir/valid_uttlist.tmp
fi
echo "$0: Holding out $len_valid_uttlist utterances in validation set and" \
"$len_trainsub_uttlist in training diagnostic set, out of total" \
"$(wc -l < $data/utt2spk)."

echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete"

awk '{print $1}' $data/utt2spk | \
utils/filter_scp.pl --exclude $dir/valid_uttlist | \
utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist
len_uttlist=$(wc -l <$dir/train_subset_uttlist)
if [ $len_uttlist -lt $num_utts_subset ]; then
echo "Number of utterances is very small. Please check your data." && exit 1;
fi
echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete"

## Set up features.
echo "$0: feature type is raw"
Expand Down Expand Up @@ -342,9 +346,8 @@ if [ $stage -le 2 ]; then
$egs_opts --normalization-fst-scale=$normalization_fst_scale \
$trans_mdl_opt $chaindir/normalization.fst \
"$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1
wait
sleep 5 # wait for file system to sync.
echo "... Getting subsets of validation examples for diagnostics and combination."
echo "$0: Getting subsets of validation examples for diagnostics and combination."
if $generate_egs_scp; then
valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp"
train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp"
Expand All @@ -365,7 +368,6 @@ if [ $stage -le 2 ]; then
$cmd $dir/log/create_train_subset_diagnostic.log \
nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \
$train_diagnostic_output || exit 1
wait
sleep 5 # wait for file system to sync.
if $generate_egs_scp; then
cat $dir/valid_combine.cegs $dir/train_combine.cegs | \
Expand All @@ -375,7 +377,7 @@ if [ $stage -le 2 ]; then
fi

for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do
[ ! -s $f ] && echo "No examples in file $f" && exit 1;
[ ! -s $f ] && echo "$0: No examples in file $f" && exit 1;
done
rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs
) || touch $dir/.error &
Expand Down Expand Up @@ -412,7 +414,7 @@ if [ $stage -le 4 ]; then
fi

if [ -f $dir/.error ]; then
echo "Error detected while creating train/valid egs" && exit 1
echo "$0: Error detected while creating train/valid egs" && exit 1
fi

if [ $stage -le 5 ]; then
Expand Down Expand Up @@ -485,11 +487,11 @@ fi

wait
if [ -f $dir/.error ]; then
echo "Error detected while creating train/valid egs" && exit 1
echo "$0: Error detected while creating train/valid egs" && exit 1
fi

if [ $stage -le 6 ]; then
echo "$0: removing temporary archives"
echo "$0: Removing temporary archives, alignments and lattices"
(
cd $dir
for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done
Expand All @@ -501,7 +503,6 @@ if [ $stage -le 6 ]; then
# there are some extra soft links that we should delete.
for f in $dir/cegs.*.*.ark; do rm $f; done
fi
echo "$0: removing temporary alignments, lattices and transforms"
rm $dir/ali.{ark,scp} 2>/dev/null
rm $dir/lat_special.*.{ark,scp} 2>/dev/null
fi
Expand Down