Skip to content

Commit 4c693fd

Browse files
committed
SLT Translation
1 parent ba680e9 commit 4c693fd

File tree

3 files changed

+81
-14
lines changed

3 files changed

+81
-14
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
4+
import sys
5+
6+
7+
# Characters that, when present in the chosen punctuation, end the output line.
sentenceEnd = [".", "!", "?"]


def add(s_filename, t_filename, context):
    """Write the source words to stdout with voted casing and punctuation.

    The two files are read in lockstep, one line each per step, until either
    one is exhausted.  At every step:

    * The whitespace-separated tokens of the target line are tallied as
      votes.  Token ``i`` (only the first ``context`` tokens are used) votes
      for the word that will be emitted ``i`` steps later.  The first
      character of a token is counted as a case vote (the code compares the
      counts of ``'L'`` and ``'U'``); the remainder of the token, possibly
      empty, is counted as a punctuation vote.
    * The first word of the source line is emitted: unchanged when ``'L'``
      votes strictly outnumber ``'U'`` votes, otherwise passed through
      ``str.title()``.  It is followed by the punctuation string with the
      most votes, unless that string is empty.
    * If the emitted punctuation contains any character from
      ``sentenceEnd``, the output line is terminated.

    NOTE(review): presumably each source line is a window of words starting
    one word later than the previous line, and each target token looks like
    ``U.`` / ``L,`` / ``L`` -- confirm against the scripts that produce the
    inputs.

    The bytes written match what the original Python 2 ``print`` statements
    produced when run as a script: tokens are separated by single spaces, a
    sentence end is written as ``" \\n"``, and an unfinished last line is
    closed with ``"\\n"`` (previously emitted by the interpreter at exit,
    now written before this function returns).

    Args:
        s_filename: Path of the source (word) file.
        t_filename: Path of the target (label) file.
        context: Number of leading target tokens per line that take part in
            the vote; must be at least 1.

    Raises:
        ValueError: If ``context`` is smaller than 1.
    """
    if context < 1:
        raise ValueError("context must be a positive integer")

    write = sys.stdout.write
    # stat[i] holds (case votes, punctuation votes) for the word that will be
    # emitted i steps from now.
    stat = [({}, {}) for _ in range(context)]
    # True while the current output line already holds at least one token.
    line_open = False

    # The context manager closes both files even if an error occurs.
    with open(s_filename) as s_f, open(t_filename) as t_f:
        s = s_f.readline()
        t = t_f.readline()
        while s and t:
            w_s = s.split()
            w_t = t.split()

            # zip() stops at the shorter of the two, i.e. after
            # min(context, len(w_t)) labels.
            for (case_votes, punc_votes), label in zip(stat, w_t):
                case_votes[label[0]] = case_votes.get(label[0], 0) + 1
                # label[1:] is "" for a one-character label, so the "no
                # punctuation" vote needs no special case.
                punc_votes[label[1:]] = punc_votes.get(label[1:], 0) + 1

            case_votes, punc_votes = stat[0]
            # A blank source line has no word to emit; skip the output but
            # still advance the voting window below.
            if w_s:
                word = w_s[0]
                if case_votes.get('L', 0) <= case_votes.get('U', 0):
                    word = word.title()

                # max() returns the first key holding the highest count,
                # matching a strict ">" scan over the dict.
                maxPunc = max(punc_votes, key=punc_votes.get) if punc_votes else ""

                pieces = [word]
                if maxPunc != "":
                    pieces.append(maxPunc)
                write((" " if line_open else "") + " ".join(pieces))
                line_open = True

                if maxPunc != "" and any(p in maxPunc for p in sentenceEnd):
                    # The space before the newline is kept on purpose for
                    # byte-compatibility with the former `print ""`.
                    write(" \n")
                    line_open = False

            s = s_f.readline()
            t = t_f.readline()
            # Slide the window: drop the emitted position, open a new one.
            stat = stat[1:] + [({}, {})]

    if line_open:
        write("\n")
62+
63+
64+
65+
def main():
    """Command-line entry point: ``<script> source U/LFile context``.

    Reads the three positional arguments from ``sys.argv`` and passes them to
    ``add`` (source file, label file, context size as an integer).

    Raises:
        SystemExit: With the usage message when the argument count is not
            three or the context argument is not an integer.
    """
    usage = "Usage: python "+sys.argv[0]+" source U/LFile context"
    args = sys.argv[1:]
    if len(args) != 3:
        raise SystemExit(usage)
    try:
        context = int(args[2])
    except ValueError:
        # Show the usage text instead of a raw traceback.
        raise SystemExit(usage)
    add(args[0], args[1], context)


# Only run the command-line interface when executed as a script, so that
# importing this module has no side effects.
if __name__ == "__main__":
    main()

scripts/monoTranslationData/Translate.sh

+5-2
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,18 @@ mkdir -p /tmp/$name
1414
mkdir -p /data/${name}/eval/
1515

1616
cd /data/$input/eval/$set/IWSLT.$set/
17-
echo -n "" > /tmp/$name/$set.np
1817
i=0
18+
echo -n "" > /data/${name}/eval/$set.s
1919
while read -r line
2020
do
21-
sed -e "s/([0-9]*)//g" $line | sed -e '/\$(.*)/d' | awk '{if($1 != "#") {printf("%s ",$5)}}END{print ""}' | sed -e 's/\,//g' | sed -e 's/\.//g' | sed -e 's/?//g' | sed -e 's/\!//g' | sed -e 's/\"//g' | sed -e 's/^\s*//g' | sed -e 's/\s\s*/ /g' | perl -nle 'print lc' >> /tmp/$name/$set.$i.np
21+
sed -e "s/([0-9]*)//g" $line | sed -e '/\$(.*)/d' | awk '{if($1 != "#") {printf("%s ",$5)}}END{print ""}' | sed -e 's/\,//g' | sed -e 's/\.//g' | sed -e 's/?//g' | sed -e 's/\!//g' | sed -e 's/\"//g' | sed -e 's/^\s*//g' | sed -e 's/\s\s*/ /g' | perl -nle 'print lc' > /tmp/$name/$set.$i.np
2222
/opt/SLT.KIT/scripts/monoTranslationData/ConCat10.pl /tmp/$name/$set.$i.np 10 > /tmp/$name/$set.$i.np.concat
2323
cat /tmp/$name/$set.$i.np.concat | /opt/subword-nmt/apply_bpe.py -c /model/${name}/codec --vocabulary /model/${name}/voc --vocabulary-threshold 50 > /data/${name}/eval/$set.$i.s
2424
/opt/SLT.KIT/scripts/openNMT-py/Translate.sh $set.$i $name $translator
25+
python /opt/SLT.KIT/scripts/monoTranslationData/AddPunctuation.py /tmp/$name/$set.$i.np.concat /data/$translator/eval/$set.$i.t 10 >> /data/${name}/eval/$set.s
2526
((i++))
2627
done < /data/$input/eval/$set/IWSLT.$set/CTM_LIST
2728

29+
rm -r /tmp/$name
30+
2831

systems/smallTED/Translate.sh

+3-12
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,8 @@ if [ ! -e /data/orig/eval/$set ]; then
1111

1212
fi
1313

14+
#Add punctuation
1415
/opt/SLT.KIT/scripts/monoTranslationData/Translate.sh $set orig monoTransPrepro monTrans
1516

16-
#Preprocess Data
17-
#/opt/SLT.KIT/scripts/defaultPreprocessor/Train.sh orig prepro
18-
19-
20-
#Train NMT
21-
#/opt/SLT.KIT/scripts/openNMT-py/Train.sh prepro mt
22-
23-
24-
#Preprocess for Puncutation
25-
26-
#monTranslationSystem
27-
#/opt/SLT.KIT/scripts/openNMT-py/Train.sh monoTransPrepro monTrans
17+
#Translate
18+
/opt/SLT.KIT/scripts/openNMT-py/Translate.sh $set monoTransPrepro mt

0 commit comments

Comments
 (0)