forked from autosimtrans/SimulTransBaseline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_data.sh
91 lines (71 loc) · 3.2 KB
/
get_data.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#! /usr/bin/env bash
set -e
OUTPUT_DIR=$PWD/data
OUTPUT_DIR_DATA="${OUTPUT_DIR}/wmt18_zhen_data"
ORIGIN_DIR_DATA="${OUTPUT_DIR_DATA}/corpus"
OUTPUT_DIR_BPE_DATA="${OUTPUT_DIR}/wmt18_zhen_data_bpe"
LANG1="zh"
LANG2="en"
mkdir -p $OUTPUT_DIR_DATA $OUTPUT_DIR_BPE_DATA
if [ ! -e ${ORIGIN_DIR_DATA} ]; then
echo "Download WMT18 Preprocessed Training Data"
wget -O ${OUTPUT_DIR_DATA}/corpus.gz http://data.statmt.org/wmt18/translation-task/preprocessed/zh-en/corpus.gz
gunzip ${OUTPUT_DIR_DATA}/corpus.gz
awk -F"\\t" '{print $1}' ${ORIGIN_DIR_DATA} > ${ORIGIN_DIR_DATA}.${LANG1}
awk -F"\\t" '{print $2}' ${ORIGIN_DIR_DATA} > ${ORIGIN_DIR_DATA}.${LANG2}
fi
if [ ! -e ${OUTPUT_DIR_DATA}/newsdev2017.tc.en ]; then
echo "Download WMT18 Preprocessed Dev Data"
wget -O ${OUTPUT_DIR_DATA}/dev.tgz http://data.statmt.org/wmt18/translation-task/preprocessed/zh-en/dev.tgz
tar zxvf ${OUTPUT_DIR_DATA}/dev.tgz -C $OUTPUT_DIR_DATA/
fi
# Clone mosesdecoder
if [ ! -d ${OUTPUT_DIR}/mosesdecoder ]; then
echo "Cloning moses for data processing"
git clone https://github.com/moses-smt/mosesdecoder.git ${OUTPUT_DIR}/mosesdecoder
fi
# Clone subword-nmt and generate BPE data
if [ ! -d ${OUTPUT_DIR}/subword-nmt ]; then
git clone https://github.com/rsennrich/subword-nmt.git ${OUTPUT_DIR}/subword-nmt
fi
# Generate BPE data and vocabulary
num_operations=16000
if [ ! -e ${OUTPUT_DIR_BPE_DATA}/bpe.en.${num_operations} ]; then
echo "Learn BPE with ${num_operations} merge operations"
cat ${ORIGIN_DIR_DATA}.${LANG1} | \
${OUTPUT_DIR}/subword-nmt/learn_bpe.py -s $num_operations > ${OUTPUT_DIR_BPE_DATA}/bpe.${LANG1}.${num_operations}
cat ${ORIGIN_DIR_DATA}.${LANG2} | \
${OUTPUT_DIR}/subword-nmt/learn_bpe.py -s $num_operations > ${OUTPUT_DIR_BPE_DATA}/bpe.${LANG2}.${num_operations}
fi
for l in ${LANG1} ${LANG2}; do
for f in `ls ${OUTPUT_DIR_DATA}/*.$l`; do
f_base=${f%.*} # dir/train.tok dir/train.tok.clean dir/newstest2016.tok
f_base=${f_base##*/} # train.tok train.tok.clean newstest2016.tok
f_out=${OUTPUT_DIR_BPE_DATA}/${f_base}.bpe.${num_operations}.$l
if [ ! -e $f_out ]; then
echo "Apply BPE to "$f
${OUTPUT_DIR}/subword-nmt/apply_bpe.py -c ${OUTPUT_DIR_BPE_DATA}/bpe.$l.${num_operations} < $f > $f_out
fi
done
done
for l in ${LANG1} ${LANG2}; do
if [ ! -e ${OUTPUT_DIR_BPE_DATA}/vocab.${l}.bpe.${num_operations} ]; then
echo "Create vocabulary for BPE data"
cat ${OUTPUT_DIR_BPE_DATA}/corpus.bpe.${num_operations}.${l} | \
${OUTPUT_DIR}/subword-nmt/get_vocab.py | cut -f1 -d ' ' > ${OUTPUT_DIR_BPE_DATA}/vocab.${l}.bpe.${num_operations}
fi
done
# Adapt to the reader
for f in ${OUTPUT_DIR_BPE_DATA}/*.bpe.${num_operations}.${LANG1}; do
f_base=${f%.*} # dir/train.tok.clean.bpe.32000 dir/newstest2016.tok.bpe.32000
f_out=${f_base}.${LANG1}-${LANG2}
if [ ! -e $f_out ]; then
paste -d '\t' $f_base.${LANG1} $f_base.${LANG2} > $f_out
fi
done
for l in ${LANG1} ${LANG2}; do
if [ ! -e ${OUTPUT_DIR_BPE_DATA}/vocab_all.${l}.bpe.${num_operations} ]; then
sed '1i\<s>\n<e>\n<unk>' ${OUTPUT_DIR_BPE_DATA}/vocab.${l}.bpe.${num_operations} > ${OUTPUT_DIR_BPE_DATA}/vocab_all.${l}.bpe.${num_operations}
fi
done
echo "All done."