Skip to content

Commit 4f441b0

Browse files
committed
Update preprocessing
1 parent 8693e7e commit 4f441b0

File tree

3 files changed

+59
-46
lines changed

3 files changed

+59
-46
lines changed

scripts/dataCollection/IWSLT.2017.sh

File mode changed: 100644 → 100755
+8 -7
Original file line number | Diff line number | Diff line change
@@ -5,19 +5,20 @@ cd /tmp/corpus
55
wget https://wit3.fbk.eu/archive/2017-01-trnted//texts/${sl}/${tl}/${sl}-${tl}.tgz
66
tar -xzvf ${sl}-${tl}.tgz
77

8-
mkdir -p /data/parallel/
8+
mkdir -p /data/orig/parallel/
99

1010
#KEEP only lines, where there is no xml in source or target
1111
paste ${sl}-${tl}/train.tags.${sl}-${tl}.${sl} ${sl}-${tl}/train.tags.${sl}-${tl}.${tl} | awk '{if($1 ~ /^</ && $NF ~ />\s*$/) {print "REMOVE"}else{print "KEEP";}}' > ${sl}-${tl}/train.tags.lines
12-
paste ${sl}-${tl}/train.tags.lines ${sl}-${tl}/train.tags.${sl}-${tl}.${sl} | awk '{if($1 == "KEEP"){$1="";print $0}}' | sed -e "s/^\s*//g" > /data/parallel/TED.s
13-
paste ${sl}-${tl}/train.tags.lines ${sl}-${tl}/train.tags.${sl}-${tl}.${tl} | awk '{if($1 == "KEEP"){$1="";print $0}}' | sed -e "s/^\s*//g" > /data/parallel/TED.t
12+
paste ${sl}-${tl}/train.tags.lines ${sl}-${tl}/train.tags.${sl}-${tl}.${sl} | awk '{if($1 == "KEEP"){$1="";print $0}}' | sed -e "s/^\s*//g" > /data/orig/parallel/TED.s
13+
paste ${sl}-${tl}/train.tags.lines ${sl}-${tl}/train.tags.${sl}-${tl}.${tl} | awk '{if($1 == "KEEP"){$1="";print $0}}' | sed -e "s/^\s*//g" > /data/orig/parallel/TED.t
1414

15-
mkdir -p /data/valid/
15+
mkdir -p /data/orig/valid/
1616

1717

18-
paste ${sl}-${tl}/IWSLT17.TED.tst2014.${sl}-${tl}.${sl}.xml ${sl}-${tl}/IWSLT17.TED.tst2014.${sl}-${tl}.${tl}.xml | awk '{if($1 ~ /^</ && $NF ~ />\s*$/) {print "REMOVE"}else{print "KEEP";}}' > ${sl}-${tl}/IWSLT17.TED.tst2014.lines
19-
paste ${sl}-${tl}/IWSLT17.TED.tst2014.lines ${sl}-${tl}/IWSLT17.TED.tst2014.${sl}-${tl}.${sl}.xml | awk '{if($1 == "KEEP"){$1="";print $0}}' | sed -e "s/^\s*//g" > /data/valid/TED.tst2014.s
20-
paste ${sl}-${tl}/IWSLT17.TED.tst2014.lines ${sl}-${tl}/IWSLT17.TED.tst2014.${sl}-${tl}.${tl}.xml | awk '{if($1 == "KEEP"){$1="";print $0}}' | sed -e "s/^\s*//g" > /data/valid/TED.tst2014.t
18+
19+
grep "<seg" ${sl}-${tl}/IWSLT17.TED.tst2014.${sl}-${tl}.${sl}.xml | sed -e "s/<[^>]*>//g" | sed -e "s/^\s*//g" > /data/orig/valid/TED.tst2014.s
20+
grep "<seg" ${sl}-${tl}/IWSLT17.TED.tst2014.${sl}-${tl}.${tl}.xml | sed -e "s/<[^>]*>//g" | sed -e "s/^\s*//g" > /data/orig/valid/TED.tst2014.t
21+
2122

2223
cd /
2324
rm -r /tmp/corpus/

scripts/defaultPreprocessor/Train.sh

+48 -38
Original file line number | Diff line number | Diff line change
@@ -1,92 +1,102 @@
11
#!/bin/bash
22

33

4-
mkdir -p /tmp/defaultPreprocessor/tok/train
5-
mkdir -p /tmp/defaultPreprocessor/tok/valid
6-
mkdir -p /tmp/defaultPreprocessor/sc/train
7-
mkdir -p /tmp/defaultPreprocessor/sc/valid
8-
mkdir -p /data/defaultPreprocessor/model
4+
input=$1
5+
name=$2
96

7+
mkdir -p /tmp/${name}/tok/train
8+
mkdir -p /tmp/${name}/tok/valid
9+
mkdir -p /tmp/${name}/sc/train
10+
mkdir -p /tmp/${name}/sc/valid
11+
mkdir -p /model/${name}
12+
mkdir -p /data/${name}/train
13+
mkdir -p /data/${name}/valid
1014

1115
##TOKENIZE
1216

13-
echo "" > /tmp/defaultPreprocessor/corpus.tok.s
14-
for f in /data/parallel/*\.s
17+
echo "" > /tmp/${name}/corpus.tok.s
18+
for f in /data/${input}/parallel/*\.s
1519
do
16-
cat $f | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${sl} > /tmp/defaultPreprocessor/tok/train/${f##*/}
17-
cat /tmp/defaultPreprocessor/tok/train/${f##*/} >> /tmp/defaultPreprocessor/corpus.tok.s
20+
cat $f | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${sl} > /tmp/${name}/tok/train/${f##*/}
21+
cat /tmp/${name}/tok/train/${f##*/} >> /tmp/${name}/corpus.tok.s
1822
done
19-
for f in /data/valid/*\.s
23+
for f in /data/${input}/valid/*\.s
2024
do
21-
cat $f | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${sl} > /tmp/defaultPreprocessor/tok/valid/${f##*/}
25+
cat $f | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${sl} > /tmp/${name}/tok/valid/${f##*/}
2226
done
2327

2428

2529

26-
echo "" > /tmp/defaultPreprocessor/corpus.tok.t
27-
for f in /data/parallel/*\.t
30+
echo "" > /tmp/${name}/corpus.tok.t
31+
for f in /data/${input}/parallel/*\.t
2832
do
29-
cat $f | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${sl} > /tmp/defaultPreprocessor/tok/train/${f##*/}
30-
cat /tmp/defaultPreprocessor/tok/train/${f##*/} >> /tmp/defaultPreprocessor/corpus.tok.t
33+
cat $f | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${tl} > /tmp/${name}/tok/train/${f##*/}
34+
cat /tmp/${name}/tok/train/${f##*/} >> /tmp/${name}/corpus.tok.t
3135
done
32-
for f in /data/valid/*\.t
36+
for f in /data/${input}/valid/*\.t
3337
do
34-
cat $f | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${sl} > /tmp/defaultPreprocessor/tok/valid/${f##*/}
38+
cat $f | perl /opt/mosesdecoder/scripts/tokenizer/tokenizer.perl -l ${tl} > /tmp/${name}/tok/valid/${f##*/}
3539
done
3640

3741

3842

3943
##SMARTCASE
4044

4145

42-
/opt/mosesdecoder/scripts/recaser/train-truecaser.perl --model /data/defaultPreprocessor/model/truecase-model.s --corpus /tmp/defaultPreprocessor/corpus.tok.s
43-
/opt/mosesdecoder/scripts/recaser/train-truecaser.perl --model /data/defaultPreprocessor/model/truecase-model.t --corpus /tmp/defaultPreprocessor/corpus.tok.t
46+
/opt/mosesdecoder/scripts/recaser/train-truecaser.perl --model /model/${name}/truecase-model.s --corpus /tmp/${name}/corpus.tok.s
47+
/opt/mosesdecoder/scripts/recaser/train-truecaser.perl --model /model/${name}/truecase-model.t --corpus /tmp/${name}/corpus.tok.t
4448

45-
for set in dev train
49+
for set in valid train
4650
do
47-
for f in /tmp/defaultPreprocessor/tok/$set/*\.s
51+
for f in /tmp/${name}/tok/$set/*\.s
4852
do
49-
cat $f | /opt/mosesdecoder/scripts/recaser/truecase.perl --model /tmp/defaultPreprocessor/corpus.tok.s > /tmp/defaultPreprocessor/sc/$set/${f##*/}
53+
cat $f | /opt/mosesdecoder/scripts/recaser/truecase.perl --model /model/${name}/truecase-model.s > /tmp/${name}/sc/$set/${f##*/}
5054
done
5155
done
5256

53-
for set in dev train
57+
for set in valid train
5458
do
55-
for f in /tmp/defaultPreprocessor/tok/$set/*\.t
59+
for f in /tmp/${name}/tok/$set/*\.t
5660
do
57-
cat $f | /opt/mosesdecoder/scripts/recaser/truecase.perl --model /tmp/defaultPreprocessor/corpus.tok.t > /tmp/defaultPreprocessor/sc/$set/${f##*/}
61+
cat $f | /opt/mosesdecoder/scripts/recaser/truecase.perl --model /model/${name}/truecase-model.t > /tmp/${name}/sc/$set/${f##*/}
5862
done
5963
done
6064

61-
echo "" > /tmp/defaultPreprocessor/corpus.sc.s
62-
for f in /tmp/defaultPreprocessor/sc/train/*\.s
65+
echo "" > /tmp/${name}/corpus.sc.s
66+
for f in /tmp/${name}/sc/train/*\.s
6367
do
64-
cat $f >> /tmp/defaultPreprocessor/corpus.sc.s
68+
cat $f >> /tmp/${name}/corpus.sc.s
6569
done
6670

67-
echo "" > /tmp/defaultPreprocessor/corpus.sc.t
68-
for f in /tmp/defaultPreprocessor/sc/train/*\.t
71+
echo "" > /tmp/${name}/corpus.sc.t
72+
for f in /tmp/${name}/sc/train/*\.t
6973
do
70-
cat $f >> /tmp/defaultPreprocessor/corpus.sc.t
74+
cat $f >> /tmp/${name}/corpus.sc.t
7175
done
7276

7377
##BPE
7478

7579

76-
/opt/subword-nmt/learn_joint_bpe_and_vocab.py --input /tmp/defaultPreprocessor/corpus.sc.s /tmp/defaultPreprocessor/corpus.sc.t -s 40000 -o /data/defaultPreprocessor/model/codec --write-vocabulary /data/defaultPreprocessor/model/voc.s /data/defaultPreprocessor/model/voc.t
80+
/opt/subword-nmt/learn_joint_bpe_and_vocab.py --input /tmp/${name}/corpus.sc.s /tmp/${name}/corpus.sc.t -s 40000 -o /model/${name}/codec --write-vocabulary /model/${name}/voc.s /model/${name}/voc.t
7781

7882

79-
for set in dev train
83+
for set in valid train
8084
do
81-
for f in /tmp/defaultPreprocessor/tok/$set/*\.s
85+
for f in /tmp/${name}/tok/$set/*\.s
8286
do
83-
/opt/subword-nmt/apply_bpe.py -c /data/defaultPreprocessor/model/codec --vocabulary /data/defaultPreprocessor/model/voc.s --vocabulary-threshold 50 < $f > /data/defaultPreprocessor/$set/${f##*/}
87+
echo $f
88+
/opt/subword-nmt/apply_bpe.py -c /model/${name}/codec --vocabulary /model/${name}/voc.s --vocabulary-threshold 50 < $f > /data/${name}/$set/${f##*/}
89+
done
8490
done
8591

86-
for set in dev train
92+
for set in valid train
8793
do
88-
for f in /tmp/defaultPreprocessor/tok/$set/*\.t
94+
for f in /tmp/${name}/tok/$set/*\.t
8995
do
90-
/opt/subword-nmt/apply_bpe.py -c /data/defaultPreprocessor/model/codec --vocabulary /data/defaultPreprocessor/model/voc.t --vocabulary-threshold 50 < $f > /data/defaultPreprocessor/$set/${f##*/}
96+
echo $f
97+
/opt/subword-nmt/apply_bpe.py -c /model/${name}/codec --vocabulary /model/${name}/voc.t --vocabulary-threshold 50 < $f > /data/${name}/$set/${f##*/}
98+
done
9199
done
92100

101+
102+
rm -r /tmp/${name}/

systems/smallTED/Train.sh

+3 -1
Original file line number | Diff line number | Diff line change
@@ -5,5 +5,7 @@
55

66

77
#Preprocess Data
8-
/opt/SLT.KIT/scripts/defaultPreprocessor/Train.sh
8+
/opt/SLT.KIT/scripts/defaultPreprocessor/Train.sh orig prepro
9+
10+
911

0 commit comments

Comments (0)