Skip to content

Commit 7b94d69

Browse files
committed
Prepro for ASR
1 parent 52581ef commit 7b94d69

File tree

1 file changed

+1
-1
lines changed

1 file changed

+1
-1
lines changed

scripts/monoTranslationData/Translate.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ echo -n "" > /data/${name}/eval/$set.s
2525
while read -r line
2626
do
2727
#clean ctm; sort by time; extract words; remove punctuation; lower-case
28-
sed -e "s/([0-9]*)//g" $line | sed -e '/\$(.*)/d' | sort -g -k 3,3 | awk '{if($1 != "#") {printf("%s ",$5)}}END{print ""}' | sed -e 's/\,//g' | sed -e 's/\.//g' | sed -e 's/?//g' | sed -e 's/\!//g' | sed -e 's/\"//g' | sed -e 's/^\s*//g' | sed -e 's/\s\s*/ /g' | perl -nle 'print lc' > /tmp/$name/$set.$i.np
28+
sed -e "s/([0-9]*)//g" $line | sed -e '/\$(.*)/d' | sort -g -k 3,3 | awk '{if($1 != "#") {printf("%s ",$5)}}END{print ""}' | sed -e 's/\,//g' | sed -e 's/\.//g' | sed -e 's/?//g' | sed -e 's/\!//g' | sed -e 's/\"//g' | sed -e 's/^\s*//g' | sed -e 's/\s\s*/ /g' | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${sl} | perl -nle 'print lc' > /tmp/$name/$set.$i.np
2929
/opt/SLT.KIT/scripts/monoTranslationData/ConCat10.pl /tmp/$name/$set.$i.np 10 > /tmp/$name/$set.$i.np.concat
3030
cat /tmp/$name/$set.$i.np.concat | /opt/subword-nmt/apply_bpe.py -c /model/${name}/codec --vocabulary /model/${name}/voc --vocabulary-threshold 50 > /data/${name}/eval/$set.$i.s
3131
/opt/SLT.KIT/scripts/openNMT-py/Translate.sh $set.$i $name $translator $model

0 commit comments

Comments
 (0)