# Patch commentary (text ahead of the "diff --git" header is not part of any hunk).
#
# Target: egs/aishell/v1/local/aishell_data_prep.sh (mode 100755), a single hunk
# starting at line 40 that shrinks 13 lines to 11 inside the
# "Transcriptions preparation" loop over $train_dir and $test_dir.
#
# What the hunk changes:
#   1. $dir/utt.list and $dir/utt2spk_all are still derived from $dir/wav.flist
#      with sed + awk, but are no longer piped through `sort`, so their rows
#      stay in the same order as the rows of wav.flist.
#   2. The second write of $dir/utt.list (first column of transcripts.txt) now
#      goes through `sort -u` instead of being written as-is.
#
# What the hunk leaves alone (context lines):
#   - `paste -d' ' $dir/utt.list $dir/wav.flist` still joins the two files
#     row by row to build wav.scp_all.
#   - utt2spk, wav.scp and text are still produced with `sort -u`.
#
# NOTE(review): presumably the first change exists because a sorted utt.list
# could be out of step with the unsorted wav.flist in the row-wise paste,
# pairing utterance ids with the wrong wav paths -- confirm against the
# commit message.
# NOTE(review): in this copy the whole patch sits on one physical line, although
# the hunk header announces 13 old / 11 new lines. The original line breaks and
# indentation must be restored before the patch can be applied.
# The text after the second "@@" (the n=`cat ... | wc -l` assignment) appears
# to be the hunk's section heading rather than a changed line -- TODO confirm.
diff --git a/egs/aishell/v1/local/aishell_data_prep.sh b/egs/aishell/v1/local/aishell_data_prep.sh index 70d6ba1f3e5..11d131dcdb1 100755 --- a/egs/aishell/v1/local/aishell_data_prep.sh +++ b/egs/aishell/v1/local/aishell_data_prep.sh @@ -40,13 +40,11 @@ n=`cat $train_dir/wav.flist $dev_dir/wav.flist $test_dir/wav.flist | wc -l` # Transcriptions preparation for dir in $train_dir $test_dir; do echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' |\ - sort > $dir/utt.list - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' |\ - sort > $dir/utt2spk_all + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list + sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text_dir/*.txt > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list + awk '{print $1}' $dir/transcripts.txt | sort -u > $dir/utt.list utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp sort -u $dir/transcripts.txt > $dir/text