diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 9996820d6d3..ae4a0474a24 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -151,36 +151,40 @@ mkdir -p $dir/log $dir/info # Get list of validation utterances. frame_shift=$(utils/data/get_frame_shift.sh $data) || exit 1 +if [ -f $data/utt2uniq ]; then + # Must hold out all augmented versions of the same utterance. + echo "$0: File $data/utt2uniq exists, so ensuring the hold-out set" \ + "includes all perturbed versions of the same source utterance." + utils/utt2spk_to_spk2utt.pl $data/utt2uniq 2>/dev/null | + awk -v max_utt=$num_utts_subset '{ + for (n=2;n<=NF;n++) print $n; + printed += NF-1; + if (printed >= max_utt) nextfile; }' | + sort > $dir/valid_uttlist +else + awk '{print $1}' $data/utt2spk | \ + utils/shuffle_list.pl 2>/dev/null | \ + head -$num_utts_subset > $dir/valid_uttlist +fi +len_valid_uttlist=$(wc -l < $dir/valid_uttlist) + awk '{print $1}' $data/utt2spk | \ - utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/valid_uttlist + utils/filter_scp.pl --exclude $dir/valid_uttlist | \ + utils/shuffle_list.pl 2>/dev/null | \ + head -$num_utts_subset > $dir/train_subset_uttlist +len_trainsub_uttlist=$(wc -l <$dir/train_subset_uttlist) -len_uttlist=$(wc -l < $dir/valid_uttlist) -if [ $len_uttlist -lt $num_utts_subset ]; then - echo "Number of utterances is very small. Please check your data." && exit 1; +if [[ $len_valid_uttlist -lt $num_utts_subset || + $len_trainsub_uttlist -lt $num_utts_subset ]]; then + echo "$0: Number of utterances is very small. Please check your data." && exit 1; fi -if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. - # because of this stage we can again have utts with lengths less than - # frames_per_eg - echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" - echo "include all perturbed versions of the same 'real' utterances." - mv $dir/valid_uttlist $dir/valid_uttlist.tmp - utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $dir/uniq2utt - cat $dir/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ - sort | uniq | utils/apply_map.pl $dir/uniq2utt | \ - awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $dir/valid_uttlist - rm $dir/uniq2utt $dir/valid_uttlist.tmp -fi +echo "$0: Holding out $len_valid_uttlist utterances in validation set and" \ + "$len_trainsub_uttlist in training diagnostic set, out of total" \ + "$(wc -l < $data/utt2spk)." -echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" -awk '{print $1}' $data/utt2spk | \ - utils/filter_scp.pl --exclude $dir/valid_uttlist | \ - utils/shuffle_list.pl 2>/dev/null | head -$num_utts_subset > $dir/train_subset_uttlist -len_uttlist=$(wc -l <$dir/train_subset_uttlist) -if [ $len_uttlist -lt $num_utts_subset ]; then - echo "Number of utterances is very small. Please check your data." && exit 1; -fi +echo "$0: creating egs. To ensure they are not deleted later you can do: touch $dir/.nodelete" ## Set up features. echo "$0: feature type is raw" @@ -342,9 +346,8 @@ if [ $stage -le 2 ]; then $egs_opts --normalization-fst-scale=$normalization_fst_scale \ $trans_mdl_opt $chaindir/normalization.fst \ "$train_subset_feats" ark,s,cs:- "ark:$dir/train_subset_all.cegs" || exit 1 - wait sleep 5 # wait for file system to sync. - echo "... Getting subsets of validation examples for diagnostics and combination." + echo "$0: Getting subsets of validation examples for diagnostics and combination." if $generate_egs_scp; then valid_diagnostic_output="ark,scp:$dir/valid_diagnostic.cegs,$dir/valid_diagnostic.scp" train_diagnostic_output="ark,scp:$dir/train_diagnostic.cegs,$dir/train_diagnostic.scp" @@ -365,7 +368,6 @@ if [ $stage -le 2 ]; then $cmd $dir/log/create_train_subset_diagnostic.log \ nnet3-chain-subset-egs --n=$num_egs_diagnostic ark:$dir/train_subset_all.cegs \ $train_diagnostic_output || exit 1 - wait sleep 5 # wait for file system to sync. if $generate_egs_scp; then cat $dir/valid_combine.cegs $dir/train_combine.cegs | \ @@ -375,7 +377,7 @@ if [ $stage -le 2 ]; then fi for f in $dir/{combine,train_diagnostic,valid_diagnostic}.cegs; do - [ ! -s $f ] && echo "No examples in file $f" && exit 1; + [ ! -s $f ] && echo "$0: No examples in file $f" && exit 1; done rm $dir/valid_all.cegs $dir/train_subset_all.cegs $dir/{train,valid}_combine.cegs ) || touch $dir/.error & @@ -412,7 +414,7 @@ if [ $stage -le 4 ]; then fi if [ -f $dir/.error ]; then - echo "Error detected while creating train/valid egs" && exit 1 + echo "$0: Error detected while creating train/valid egs" && exit 1 fi if [ $stage -le 5 ]; then @@ -485,11 +487,11 @@ fi wait if [ -f $dir/.error ]; then - echo "Error detected while creating train/valid egs" && exit 1 + echo "$0: Error detected while creating train/valid egs" && exit 1 fi if [ $stage -le 6 ]; then - echo "$0: removing temporary archives" + echo "$0: Removing temporary archives, alignments and lattices" ( cd $dir for f in $(ls -l . | grep 'cegs_orig' | awk '{ X=NF-1; Y=NF-2; if ($X == "->") print $Y, $NF; }'); do rm $f; done @@ -501,7 +503,6 @@ if [ $stage -le 6 ]; then # there are some extra soft links that we should delete. for f in $dir/cegs.*.*.ark; do rm $f; done fi - echo "$0: removing temporary alignments, lattices and transforms" rm $dir/ali.{ark,scp} 2>/dev/null rm $dir/lat_special.*.{ark,scp} 2>/dev/null fi