#!/usr/bin/env bash
# smt-translator.sh
## Make sure you follow the build steps first!
##
## Step 1: tokenize both sides of the parallel corpus.
## Each invocation is ONE logical command: the trailing backslashes are
## required, otherwise the shell runs the tokenizer with no redirection
## (it then waits on the terminal) and treats the "<" / ">" lines as
## separate, empty commands.
## The tokenized files are written to ./corpous/ with the .en / .ah
## suffixes because that is where, and under which names, the later
## truecasing steps read them.

## tokenize the English corpus: proc.en -> proc.tok.en
./mosesdecoder/scripts/tokenizer/tokenizer.perl -l en \
  < ./corpous/proc.en \
  > ./corpous/proc.tok.en || exit 1

## tokenize the Amharic corpus: proc.am -> proc.tok.ah
## NOTE(review): the language code passed to the tokenizer is kept as "ah";
## confirm whether the tokenizer has language-specific rules for it.
./mosesdecoder/scripts/tokenizer/tokenizer.perl -l ah \
  < ./corpous/proc.am \
  > ./corpous/proc.tok.ah || exit 1
## Step 2: train the truecasing models from the tokenized corpora.
## Options must stay on the same logical line as the script (hence the
## backslashes); without them "--model" / "--corpus" are executed as
## commands of their own and the trainer runs with no arguments.
## NOTE(review): sudo is kept from the original; confirm it is needed.

## English: reads proc.tok.en, writes the case model truecase-model.en
sudo ./mosesdecoder/scripts/recaser/train-truecaser.perl \
  --model ./corpous/truecase-model.en \
  --corpus ./corpous/proc.tok.en || exit 1

## Amharic: reads proc.tok.ah, writes the case model truecase-model.ah
## (marked as redundant by the original author)
sudo ./mosesdecoder/scripts/recaser/train-truecaser.perl \
  --model ./corpous/truecase-model.ah \
  --corpus ./corpous/proc.tok.ah || exit 1
## Step 3: apply the truecasing models to the tokenized corpora.
## The model option and both redirections belong to the truecase.perl
## command, so the lines are joined with backslashes; the redirections are
## performed by the invoking shell, not by sudo.

## English: proc.tok.en -> proc.true.en using truecase-model.en
sudo ./mosesdecoder/scripts/recaser/truecase.perl \
  --model ./corpous/truecase-model.en \
  < ./corpous/proc.tok.en \
  > ./corpous/proc.true.en || exit 1

## Amharic: proc.tok.ah -> proc.true.ah using truecase-model.ah
sudo ./mosesdecoder/scripts/recaser/truecase.perl \
  --model ./corpous/truecase-model.ah \
  < ./corpous/proc.tok.ah \
  > ./corpous/proc.true.ah || exit 1
## Step 4: clean both sides of the corpus in one pass.
## Arguments, in order: input stem (proc.true), the two language suffixes
## (ah, en), output stem (proc.clean), and the minimum / maximum sentence
## length (1 and 80). They must all be on the same logical line as the
## script, otherwise the shell tries to execute the corpus stem itself.
sudo ./mosesdecoder/scripts/training/clean-corpus-n.perl \
  ./corpous/proc.true ah en \
  ./corpous/proc.clean 1 80 || exit 1
## Step 5: build the target-side (English) language model.
## All three tools are invoked from ./mosesdecoder/bin/, the same directory
## build_binary is run from; the original called lmplz at the absolute
## path /mosesdecoder/bin/ and query at ./bin/.

## estimate an order-3 model from the truecased English corpus
./mosesdecoder/bin/lmplz -o 3 \
  < ./corpous/proc.true.en \
  > ./corpous/proc.arpa.en || exit 1

## binarise the ARPA file (for faster loading):
## proc.arpa.en -> proc.blm.en
sudo ./mosesdecoder/bin/build_binary \
  ./corpous/proc.arpa.en \
  ./corpous/proc.blm.en || exit 1

## test the language model: pipe one sentence into the query tool, which
## takes the binary model as its argument.
## NOTE(review): the sample sentence is Amharic while the model is built
## from the English corpus — confirm this is the intended smoke test.
## printf is used instead of echo so the text is passed through verbatim.
printf '%s\n' "የመንግሥት ሠራተኛው እንዲያውቀው ያልተደረገ ወይም ያልተገለጸለትን የጽሁፍ ማስረጃ በግል ማህደሩ ውስጥ ማስቀመጥ ክልክል ነው፡፡" |
  ./mosesdecoder/bin/query ./corpous/proc.blm.en
# Step 6: train the translation model, then try it out.
#
# Training runs inside a subshell that changes into ./working/, so the
# directory change does not leak into the decoder command further down
# (which uses paths relative to the project root). The subshell is started
# in the background under nohup/nice with stdout and stderr collected in
# working/training.out, and the script then waits for it: the decoder
# needs working/train/model/moses.ini, which only exists once training
# has finished.
#
# Options passed to train-model.perl:
#   -root-dir           output directory (relative to working/)
#   -corpus, -f, -e     cleaned corpus stem, source (ah) and target (en)
#   -alignment          word alignment heuristic
#   -reordering         lexicalized reordering model
#   -lm                 binary language model built in the previous step
#   -external-bin-dir   directory holding GIZA++, mkcls and snt2cooc.out
#
# The training script is taken from ../mosesdecoder/scripts/, matching the
# ./mosesdecoder/scripts/ location used by every other step.
# NOTE(review): -external-bin-dir is left at ../tools as in the original;
# confirm that is where the word-alignment binaries were installed.
(
  cd ./working/ || exit 1
  nohup nice ../mosesdecoder/scripts/training/train-model.perl \
    -root-dir train \
    -corpus ../corpous/proc.clean -f ah -e en \
    -alignment grow-diag-final-and \
    -reordering msd-bidirectional-fe \
    -lm "0:3:${HOME}/smt/corpous/proc.blm.en:8" \
    -external-bin-dir ../tools \
    > training.out 2>&1
) &
train_pid=$!

# Block until training is done; stop here if it failed.
if ! wait "$train_pid"; then
  printf 'training failed, see ./working/training.out\n' >&2
  exit 1
fi

# Testing the translation model: the decoder reads sentences from stdin.
echo "Ener an amharic string after it starts"
sudo ./mosesdecoder/bin/moses -f ./working/train/model/moses.ini