Skip to content

Commit 1adfe7e

Browse files
committed
MonoTranslationData
1 parent 67505bc commit 1adfe7e

File tree

4 files changed

+170
-0
lines changed

4 files changed

+170
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/usr/bin/perl
# Re-segment a text into lines of random length.
#
# Usage: <this script> INPUT_FILE > OUTPUT_FILE
#
# All whitespace-separated words of INPUT_FILE are collected into a single
# stream (the original line breaks are discarded) and printed again in
# chunks of $offset .. $offset + $range - 1 words per output line.  After
# each chunk, the words that immediately follow and contain a punctuation
# character (\p{P}) are appended to the same line.  Every printed word is
# followed by a single space, so each output line ends in " \n".
use strict;
use warnings;
use utf8;

my $range  = 10;    # number of distinct chunk lengths that can be drawn
my $offset = 20;    # minimum chunk length in words

@ARGV or die "Usage: $0 INPUT_FILE\n";
my $input_path = $ARGV[0];

# Three-argument open with a lexical handle: the file name is never
# interpreted as a mode, and a missing file is reported instead of
# silently producing empty output.
open my $input, '<', $input_path
    or die "Cannot open '$input_path' for reading: $!\n";
binmode $input, ':utf8';
binmode STDOUT, ':utf8';
binmode STDERR, ':utf8';

# split ' ' skips leading whitespace and splits on whitespace runs, so
# indented or blank lines never contribute empty "words".
my @words;
while (my $line = <$input>) {
    push @words, split ' ', $line;
}
close $input;

my $total = scalar @words;
my $i     = 0;

while ($i < $total) {
    my $length = int(rand($range)) + $offset;

    # The last chunk may be shorter than $length: never index past the
    # end of @words.
    my $end = $i + $length;
    $end = $total if $end > $total;

    for my $j ($i .. $end - 1) {
        print $words[$j], " ";
    }
    $i = $end;

    # Keep following words that contain punctuation on the same line.
    while ($i < $total && $words[$i] =~ /\p{P}/) {
        print $words[$i], " ";
        $i++;
    }

    print "\n";
}
56+

scripts/monoTranslationData/Train.sh

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/bin/bash
# Build the training data for the monolingual "translation" system
# (plain lowercased, unpunctuated text -> case/punctuation labels).
#
# Usage: Train.sh INPUT NAME
#   INPUT  existing data set; /data/INPUT/{train,valid}/*.s is read
#   NAME   data set to create:
#            /data/NAME/{train,valid}/<file>.s  BPE-encoded source text
#            /data/NAME/{train,valid}/<file>.t  labels from generateUL.pl
#            /model/NAME/codec                  learned BPE codes
#
# Intermediate files live in /tmp/NAME and are removed at the end.

input=$1
name=$2

# Without both arguments the paths below collapse (e.g. "rm -r /tmp//").
if [ -z "$input" ] || [ -z "$name" ]; then
    echo "Usage: $0 INPUT NAME" >&2
    exit 1
fi

# The helper Perl scripts are looked up next to this script, so the
# checkout location (/SLT.KIT vs. /opt/SLT.KIT) does not matter.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

mkdir -p "/data/${name}"
# learn_bpe.py writes its codes file here; the directory must exist.
mkdir -p "/model/${name}"

for split in train valid
do
    mkdir -p "/data/${name}/${split}"
    for sub in pc randCut np
    do
        mkdir -p "/tmp/${name}/${sub}/${split}"
    done
done


for split in train valid
do
    for f in "/data/${input}/${split}"/*.s
    do
        filename=${f##*/}

        # Remove BPE markers and XML escapes, then normalise punctuation
        # (collapse repeated marks, attach marks to the preceding word).
        # "&amp;" is unescaped last so that e.g. "&amp;lt;" is not
        # unescaped twice; the "&" in its replacement is written "\&"
        # because a bare "&" means "the whole match" to sed.
        sed -e "s/@@ //g" \
            -e "s/&apos;/'/g" -e 's/&#124;/|/g' \
            -e 's/&lt;/</g' -e 's/&gt;/>/g' -e 's/&quot;/"/g' \
            -e 's/&#91;/[/g' -e 's/&#93;/]/g' -e 's/&amp;/\&/g' \
            -e 's/\.\.\././g' \
            -e 's/ \.\s*/. /g' -e 's/ \.\s*/ /g' \
            -e 's/ \,\s*\,/ , /g' -e 's/ \,\s*/, /g' \
            -e 's/ ! ! / ! /g' -e 's/ ! ! / ! /g' -e 's/ ! ! / ! /g' \
            -e 's/\s*!\s*/! /g' -e 's/\s*?/?/g' \
            -e 's/\"\s*\"/""/g' -e 's/ "/"/g' \
            "$f" > "/tmp/${name}/pc/${split}/${filename}"

        # Randomly re-split the text into lines.
        perl "${script_dir}/RandCat_long.pl" "/tmp/${name}/pc/${split}/${filename}" \
            | sed -e 's/\s*"/"/g' \
            > "/tmp/${name}/randCut/${split}/${filename}"

        # Generate the target labels.
        perl "${script_dir}/generateUL.pl" "/tmp/${name}/randCut/${split}/${filename}" \
            > "/data/${name}/${split}/${filename%.*}.t"

        # Source side: lowercase, strip punctuation, normalise whitespace.
        perl -nle 'print lc' "/tmp/${name}/randCut/${split}/${filename}" \
            | sed -e 's/\,//g' -e 's/\.//g' -e 's/?//g' -e 's/\!//g' -e 's/\"//g' \
                  -e 's/^\s*//g' -e 's/\s\s*/ /g' \
            > "/tmp/${name}/np/${split}/${filename}"
    done
done

# Concatenate the training source text into one corpus for BPE training.
echo -n "" > "/tmp/${name}/corpus"

split=train

for f in "/data/${input}/${split}"/*.s
do
    perl -nle 'print lc' "/tmp/${name}/np/${split}/${f##*/}" >> "/tmp/${name}/corpus"
done


# Train BPE and extract the vocabulary of the BPE-encoded corpus.
/opt/subword-nmt/learn_bpe.py -s 40000 -o "/model/${name}/codec" < "/tmp/${name}/corpus"
/opt/subword-nmt/apply_bpe.py -c "/model/${name}/codec" < "/tmp/${name}/corpus" | /opt/subword-nmt/get_vocab.py > "/tmp/${name}/voc"

# Apply BPE to the source side of both splits.
for split in valid train
do
    for f in "/data/${input}/${split}"/*.s
    do
        /opt/subword-nmt/apply_bpe.py -c "/model/${name}/codec" \
            --vocabulary "/tmp/${name}/voc" --vocabulary-threshold 50 \
            < "/tmp/${name}/np/${split}/${f##*/}" \
            > "/data/${name}/${split}/${f##*/}"
    done
done

rm -r "/tmp/${name}/"
+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#!/usr/bin/perl
# Emit one label per whitespace-separated word of the input file.
#
# Usage: <this script> INPUT_FILE > LABEL_FILE
#
# For every input line one output line is printed.  Each word becomes
#   "U" if the word contains an ASCII uppercase letter, otherwise "L",
# followed by the word's trailing run of the characters . , ! ? " (if
# any), followed by a single space.  Example: `Hello, world!"` gives
# `U, L!" `.
#
# NOTE(review): the case test is /[A-Z]/, so non-ASCII capitals (e.g.
# "Ä") are labelled "L".  Left unchanged here because it alters the
# generated labels -- confirm whether \p{Lu} is wanted.
use utf8;
use strict;
use warnings;

@ARGV or die "Usage: $0 INPUT_FILE\n";
my $path = $ARGV[0];

# Three-argument open with a lexical handle; fail loudly on a bad path
# instead of silently writing an empty label file.
open my $in, '<', $path
    or die "Cannot open '$path' for reading: $!\n";
binmode $in,    ':utf8';
binmode STDOUT, ':utf8';

while (my $line = <$in>) {
    # split ' ' ignores leading whitespace, so an indented line does not
    # yield an empty first field (which would get a spurious "L" label).
    for my $word (split ' ', $line) {
        my $case = $word =~ /[A-Z]/ ? "U" : "L";

        # Trailing punctuation run, e.g. `!"` for `done!"`; empty if the
        # word does not end in one of the handled characters.
        my $suffix = $word =~ /([.,!?"]+)$/ ? $1 : "";

        print $case . $suffix, " ";
    }
    print "\n";
}

close $in;
36+

systems/smallTED/Train.sh

+8
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,12 @@
88
/opt/SLT.KIT/scripts/defaultPreprocessor/Train.sh orig prepro
99

1010

11+
#Train NMT
12+
/opt/SLT.KIT/scripts/openNMT-py/Train.sh prepro mt
1113

14+
15+
#Preprocess for Punctuation
16+
/opt/SLT.KIT/scripts/monoTranslationData/Train.sh prepro monoTransPrepro
17+
18+
#monoTranslationSystem
19+
/opt/SLT.KIT/scripts/openNMT-py/Train.sh monoTransPrepro monTrans

0 commit comments

Comments
 (0)