From 2203fcb56d1d16c897560ab3b2f23a5ced971f6e Mon Sep 17 00:00:00 2001 From: Dongji Gao Date: Thu, 11 Jul 2019 16:54:20 -0400 Subject: [PATCH] fixed bug in egs/gale_arabic/s5c/local/prepare_dict_subword.sh that it may delete words in the form of '<*>' --- egs/gale_arabic/s5c/local/prepare_dict_subword.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/gale_arabic/s5c/local/prepare_dict_subword.sh b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh index 330de664349..e05846ec593 100755 --- a/egs/gale_arabic/s5c/local/prepare_dict_subword.sh +++ b/egs/gale_arabic/s5c/local/prepare_dict_subword.sh @@ -48,7 +48,7 @@ glossaries="<UNK> <sil>" if [ $stage -le 0 ]; then echo "$0: making subword lexicon... $(date)." # get pair_code file - cut -d ' ' -f2- data/train/text | sed 's/<[^>]*>//g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt + cut -d ' ' -f2- data/train/text | sed 's/<UNK>//g;s/<sil>//g' | utils/lang/bpe/learn_bpe.py -s $num_merges > data/local/pair_code.txt mv $dir/lexicon.txt $dir/lexicon_word.txt # get words cut -d ' ' -f1 $dir/lexicon_word.txt > $dir/words.txt