#!/bin/bash
#
# Preprocess a parallel corpus: Moses tokenization, Moses truecasing, then
# joint BPE segmentation with subword-nmt.
#
# Usage: <script> INPUT NAME
#   INPUT  dataset name; training files are read from /data/INPUT/parallel
#          and validation files from /data/INPUT/valid. Source-side files
#          end in ".s", target-side files end in ".t".
#   NAME   preprocessor name; truecase models, BPE codes and vocabularies
#          are written to /model/NAME, segmented text to
#          /data/NAME/train and /data/NAME/valid.
#
# Environment:
#   sl  language code handed to the Moses tokenizer for ".s" files
#   tl  language code handed to the Moses tokenizer for ".t" files

set -euo pipefail
# An empty input directory must yield zero loop iterations, not one iteration
# over the literal glob pattern.
shopt -s nullglob

readonly MOSES_SCRIPTS=/opt/mosesdecoder/scripts
readonly SUBWORD_NMT=/opt/subword-nmt
readonly BPE_SYMBOLS=40000          # value passed to learn_joint_bpe_and_vocab.py -s
readonly BPE_VOCAB_THRESHOLD=50     # value passed to apply_bpe.py --vocabulary-threshold

# Scratch directory; global so the EXIT trap can still see it after main returns.
WORK_DIR=

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

usage() {
  printf 'Usage: %s INPUT NAME   (sl and tl must be set in the environment)\n' \
    "${0##*/}" >&2
  exit 2
}

cleanup() {
  if [[ -n "${WORK_DIR}" ]]; then
    rm -rf -- "${WORK_DIR}"
  fi
}

#######################################
# Tokenize every *.SIDE file of a directory with the Moses tokenizer.
# Arguments: $1 - input directory
#            $2 - side suffix ("s" or "t")
#            $3 - language code for tokenizer.perl -l
#            $4 - output directory (file names are preserved)
#######################################
tokenize_files() {
  local in_dir=$1 side=$2 lang=$3 out_dir=$4
  local f
  for f in "${in_dir}"/*."${side}"; do
    perl "${MOSES_SCRIPTS}/tokenizer/tokenizer.perl" -l "${lang}" \
      < "${f}" > "${out_dir}/${f##*/}"
  done
}

#######################################
# Concatenate every *.SIDE file of a directory into one corpus file.
# Fails when the directory holds no matching file, because the model
# training steps that consume the corpus need actual text.
# Arguments: $1 - input directory
#            $2 - side suffix
#            $3 - corpus file to (over)write
#######################################
concat_files() {
  local in_dir=$1 side=$2 out_file=$3
  local -a parts=("${in_dir}"/*."${side}")
  (( ${#parts[@]} > 0 )) || die "no *.${side} files found in ${in_dir}"
  cat -- "${parts[@]}" > "${out_file}"
}

#######################################
# Truecase every *.SIDE file of a directory with a trained Moses model.
# Arguments: $1 - input directory (tokenized text)
#            $2 - side suffix
#            $3 - truecase model file
#            $4 - output directory (file names are preserved)
#######################################
truecase_files() {
  local in_dir=$1 side=$2 model=$3 out_dir=$4
  local f
  for f in "${in_dir}"/*."${side}"; do
    "${MOSES_SCRIPTS}/recaser/truecase.perl" --model "${model}" \
      < "${f}" > "${out_dir}/${f##*/}"
  done
}

#######################################
# Apply BPE to every *.SIDE file of a directory, printing each input path.
# Arguments: $1 - input directory
#            $2 - side suffix
#            $3 - BPE codes file
#            $4 - vocabulary file for this side
#            $5 - output directory (file names are preserved)
#######################################
apply_bpe_files() {
  local in_dir=$1 side=$2 codes=$3 vocab=$4 out_dir=$5
  local f
  for f in "${in_dir}"/*."${side}"; do
    printf '%s\n' "${f}"
    "${SUBWORD_NMT}/apply_bpe.py" -c "${codes}" \
      --vocabulary "${vocab}" \
      --vocabulary-threshold "${BPE_VOCAB_THRESHOLD}" \
      < "${f}" > "${out_dir}/${f##*/}"
  done
}

main() {
  (( $# >= 2 )) || usage
  local input=$1 name=$2
  [[ -n "${input}" && -n "${name}" ]] || usage
  : "${sl:?sl (source language code) must be set}"
  : "${tl:?tl (target language code) must be set}"

  local data_in="/data/${input}"
  local data_out="/data/${name}"
  local model_dir="/model/${name}"

  # Private scratch space, removed on every exit path (including failures).
  WORK_DIR=$(mktemp -d) || die "cannot create a temporary directory"
  trap cleanup EXIT

  mkdir -p \
    "${WORK_DIR}/tok/train" "${WORK_DIR}/tok/valid" \
    "${WORK_DIR}/sc/train" "${WORK_DIR}/sc/valid" \
    "${model_dir}" \
    "${data_out}/train" "${data_out}/valid"

  local side lang split
  for side in s t; do
    if [[ "${side}" == s ]]; then
      lang=${sl}
    else
      lang=${tl}
    fi

    ## TOKENIZE
    tokenize_files "${data_in}/parallel" "${side}" "${lang}" "${WORK_DIR}/tok/train"
    tokenize_files "${data_in}/valid" "${side}" "${lang}" "${WORK_DIR}/tok/valid"
    concat_files "${WORK_DIR}/tok/train" "${side}" "${WORK_DIR}/corpus.tok.${side}"

    ## SMARTCASE
    # The truecase model is trained on the tokenized training text only and
    # then applied to both the validation and the training files.
    "${MOSES_SCRIPTS}/recaser/train-truecaser.perl" \
      --model "${model_dir}/truecase-model.${side}" \
      --corpus "${WORK_DIR}/corpus.tok.${side}"
    for split in valid train; do
      truecase_files "${WORK_DIR}/tok/${split}" "${side}" \
        "${model_dir}/truecase-model.${side}" "${WORK_DIR}/sc/${split}"
    done
    concat_files "${WORK_DIR}/sc/train" "${side}" "${WORK_DIR}/corpus.sc.${side}"
  done

  ## BPE
  "${SUBWORD_NMT}/learn_joint_bpe_and_vocab.py" \
    --input "${WORK_DIR}/corpus.sc.s" "${WORK_DIR}/corpus.sc.t" \
    -s "${BPE_SYMBOLS}" \
    -o "${model_dir}/codec" \
    --write-vocabulary "${model_dir}/voc.s" "${model_dir}/voc.t"

  # Segment the truecased files, not the merely tokenized ones: the codes and
  # vocabularies above were learned from the truecased corpus, so the text
  # they are applied to has to carry the same casing.
  for split in valid train; do
    for side in s t; do
      apply_bpe_files "${WORK_DIR}/sc/${split}" "${side}" \
        "${model_dir}/codec" "${model_dir}/voc.${side}" "${data_out}/${split}"
    done
  done
}

main "$@"