MonoTranslationData

jniehues-kit · jniehues-kit · commit 1adfe7eb82f7 · 2018-01-15T15:40:42.000Z
diff --git a/scripts/monoTranslationData/RandCat_long.pl b/scripts/monoTranslationData/RandCat_long.pl
@@ -0,0 +1,56 @@
+#!/usr/bin/perl
+# Re-segment a text into lines of random length.
+# Usage: RandCat_long.pl INPUT_FILE > OUTPUT_FILE
+#
+# All whitespace-separated words of the input file (first argument) are
+# pooled, ignoring the original line breaks, and written to STDOUT in lines
+# of $offset .. $offset + $range - 1 words.  Words containing punctuation
+# that directly follow a cut are appended to the line before the cut.
+use strict;
+use warnings;
+use utf8;
+
+@ARGV or die "Usage: $0 INPUT_FILE\n";
+my $input_file = $ARGV[0];
+open(my $input, '<', $input_file)
+    or die "Cannot open '$input_file': $!\n";
+binmode $input, ":utf8";
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+
+# Line lengths are drawn uniformly from $offset .. $offset + $range - 1.
+my $range  = 10;    # number of different line lengths
+my $offset = 20;    # minimum number of words per line
+
+# Collect every word of the input.
+my @words;
+while (my $line = <$input>) {
+    # Leading whitespace makes split return an empty first field; drop it
+    # so that no empty "word" is emitted.
+    push @words, grep { length } split(/\s+/, $line);
+}
+close($input);
+
+my $i = 0;
+while ($i < scalar @words) {
+    my $rand = int(rand($range)) + $offset;
+
+    # Clamp the cut position: the last line may hold fewer than $rand words,
+    # and indexing past the end of @words would print empty fields.
+    my $end = $i + $rand;
+    $end = scalar @words if $end > scalar @words;
+
+    for (my $j = $i; $j < $end; $j++) {
+        print $words[$j];
+        print " ";
+    }
+    $i = $end;
+
+    # Keep following words that contain punctuation on the current line.
+    while ($i < scalar @words && $words[$i] =~ /\p{P}/) {
+        print $words[$i], " ";
+        $i++;
+    }
+
+    print "\n";
+}
diff --git a/scripts/monoTranslationData/Train.sh b/scripts/monoTranslationData/Train.sh
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+
+input=$1
+name=$2
+
+mkdir -p /data/${name}
+mkdir -p /data/${name}
+
+for set in train valid
+do
+    
+mkdir -p /data/${name}/${set}
+for sub in pc randCut np
+do
+    
+mkdir -p /tmp/${name}/${sub}/${set}
+done
+done
+
+
+for set in train valid
+do
+    
+for f in /data/${input}/${set}/*\.s
+do
+    #remove BPE and xml escape
+    # remove double punctuation
+    cat $f | sed -e "s/@@ //g" | sed -e "s/&apos;/'/g" -e 's/&#124;/|/g' -e "s/&amp;/&/g" -e 's/&lt;/>/g' -e 's/&gt;/>/g' -e 's/&quot;/"/g' -e 's/&#91;/[/g' -e 's/&#93;/]/g' | sed -e 's/\.\.\././g' | sed -e 's/ \.\s*/. /g' | sed -e 's/ \.\s*/ /g' | sed -e 's/ \,\s*\,/ , /g' | sed -e 's/ \,\s*/, /g' | sed -e 's/ ! ! / ! /g' | sed -e 's/ ! ! / ! /g' | sed -e 's/ ! ! / ! /g' | sed -e 's/\s*!\s*/! /g' | sed -e 's/\s*?/?/g' | sed -e 's/\"\s*\"/""/g' | sed -e 's/ "/"/g' > /tmp/${name}/pc/${set}/${f##*/}
+    #randomly split data
+    perl /SLT.KIT/scripts/monoTranslationData/RandCat_long.pl /tmp/${name}/pc/${set}/${f##*/} | sed -e 's/\s*"/"/g' > /tmp/${name}/randCut/${set}/${f##*/}
+    #genereate Target Labels
+    filename=${f##*/}
+    perl /SLT.KIT/scripts/monoTranslationData/generateUL.pl /tmp/${name}/randCut/${set}/${f##*/} > /data/${name}/${set}/${filename%.*}.t
+    #remove punctuation and lowercase
+    cat /tmp/${name}/randCut/${set}/${f##*/} |  perl -nle 'print lc' | sed -e 's/\,//g' | sed -e 's/\.//g' | sed -e 's/?//g' | sed -e 's/\!//g' | sed -e 's/\"//g' | sed -e 's/^\s*//g' | sed -e 's/\s\s*/ /g' > /tmp/${name}/np/${set}/${f##*/}
+done
+done
+
+echo -n "" > /tmp/${name}/corpus
+
+set=train
+
+for f in /data/${input}/${set}/*\.s
+do
+    cat /tmp/${name}/np/${set}/${f##*/} | perl -nle 'print lc' >> /tmp/${name}/corpus
+    
+done
+
+
+
+#train BPE
+
+/opt/subword-nmt/learn_bpe.py -s 40000 -o /model/${name}/codec < /tmp/${name}/corpus
+/opt/subword-nmt/apply_bpe.py -c /model/${name}/codec  < /tmp/${name}/corpus | /opt/subword-nmt/get_vocab.py > /tmp/${name}/voc
+
+#apply BPE
+for set in valid train
+do
+    
+for f in /data/${input}/${set}/*\.s
+do
+
+    cat /tmp/${name}/np/${set}/${f##*/} | /opt/subword-nmt/apply_bpe.py -c /model/${name}/codec --vocabulary /tmp/${name}/voc --vocabulary-threshold 50 > /data/${name}/${set}/${f##*/}
+
+done
+
+done
+
+rm -r /tmp/${name}/
diff --git a/scripts/monoTranslationData/generateUL.pl b/scripts/monoTranslationData/generateUL.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl
+# Generate the target labels for the punctuation and casing system.
+# Usage: generateUL.pl INPUT_FILE > LABEL_FILE
+# For every whitespace-separated word of INPUT_FILE one label is printed:
+# "U" if the word contains an upper-case letter, "L" otherwise, followed by
+# the word's trailing run of . , ! ? " characters (e.g. "Hello," -> "U,").
+# Each input line yields one output line; every label is followed by a space.
+
+use utf8;
+use strict;
+use warnings;
+
+@ARGV or die "Usage: $0 INPUT_FILE\n";
+my $input_file = $ARGV[0];
+open(my $in, '<', $input_file) or die "Cannot open '$input_file': $!\n";
+binmode $in, ":utf8";
+
+binmode STDOUT, ":utf8";
+
+while (my $line = <$in>) {
+    foreach my $word (split(/\s+/, $line)) {
+        # Leading whitespace makes split return an empty first field; it is
+        # not a word and must not receive a label.
+        next if $word eq "";
+
+        # Longest run of punctuation marks at the end of the word (may be
+        # empty, or several marks such as ?").
+        my ($fix) = $word =~ /([.,!?"]*)\z/;
+
+        # \p{Lu} also recognises non-ASCII capitals such as "Ä", which an
+        # ASCII-only class like [A-Z] would miss.
+        my $case = ($word =~ /\p{Lu}/) ? "U" : "L";
+        print $case . $fix, " ";
+    }
+    print "\n";
+}
diff --git a/systems/smallTED/Train.sh b/systems/smallTED/Train.sh
@@ -8,4 +8,12 @@
 /opt/SLT.KIT/scripts/defaultPreprocessor/Train.sh orig prepro
 
 
+#Train NMT
+/opt/SLT.KIT/scripts/openNMT-py/Train.sh prepro mt
 
+
+#Preprocess for Punctuation
+/SLT.KIT/scripts/monoTranslationData/Train.sh prepro monoTransPrepro
+
+#Train monoTranslation system
+/opt/SLT.KIT/scripts/openNMT-py/Train.sh monoTransPrepro monTrans