From cda422e25871e8f3b3fed805fc236b16060ad200 Mon Sep 17 00:00:00 2001 From: Prasasto Adi Date: Thu, 31 Oct 2019 22:13:25 +0700 Subject: [PATCH] En-Id untokenized parallel corpora Change to untokenized parallel corpora because t2t automatically tokenizes sentences. --- .../data_generators/translate_enid.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tensor2tensor/data_generators/translate_enid.py b/tensor2tensor/data_generators/translate_enid.py index bf598cae2..1aa2d2a2b 100644 --- a/tensor2tensor/data_generators/translate_enid.py +++ b/tensor2tensor/data_generators/translate_enid.py @@ -39,24 +39,24 @@ # http://www.panl10n.net/english/outputs/Indonesia/BPPT/0902/BPPTIndToEngCorpusHalfM.zip # pylint: disable=line-too-long _ENID_TRAIN_DATASETS = [ [ - _REPO + "IWSLT17.train.en-id.tok.tgz", - ("IWSLT17.train.en-id.tok.en", "IWSLT17.train.en-id.tok.id") + _REPO + "IWSLT17.train.en-id.tgz", + ("IWSLT17.train.en-id.en", "IWSLT17.train.en-id.id") ], [ - _REPO + "PANL-BPPT-ECO-EN-ID-150Kw.tok.tgz", - ("PANL-BPPT-ECO-EN-150Kw.tok.txt", "PANL-BPPT-ECO-ID-150Kw.tok.txt") + _REPO + "PANL-BPPT-ECO-EN-ID-150Kw.tgz", + ("PANL-BPPT-ECO-EN-150Kw.txt", "PANL-BPPT-ECO-ID-150Kw.txt") ], [ - _REPO + "PANL-BPPT-INT-EN-ID-150Kw.tok.tgz", - ("PANL-BPPT-INT-EN-150Kw.tok.txt", "PANL-BPPT-INT-ID-150Kw.tok.txt") + _REPO + "PANL-BPPT-INT-EN-ID-150Kw.tgz", + ("PANL-BPPT-INT-EN-150Kw.txt", "PANL-BPPT-INT-ID-150Kw.txt") ], [ - _REPO + "PANL-BPPT-SCI-EN-ID-100Kw.tok.tgz", - ("PANL-BPPT-SCI-EN-100Kw.tok.txt", "PANL-BPPT-SCI-ID-100Kw.tok.txt") + _REPO + "PANL-BPPT-SCI-EN-ID-100Kw.tgz", + ("PANL-BPPT-SCI-EN-100Kw.txt", "PANL-BPPT-SCI-ID-100Kw.txt") ], [ - _REPO + "PANL-BPPT-SPO-EN-ID-100Kw.tok.tgz", - ("PANL-BPPT-SPO-EN-100Kw.tok.txt", "PANL-BPPT-SPO-ID-100Kw.tok.txt") + _REPO + "PANL-BPPT-SPO-EN-ID-100Kw.tgz", + ("PANL-BPPT-SPO-EN-100Kw.txt", "PANL-BPPT-SPO-ID-100Kw.txt") ], ] @@ -65,9 +65,9 @@ # https://wit3.fbk.eu/mt.php?release=2017-01-more 
_ENID_TEST_DATASETS = [ [ - _REPO + "IWSLT17.TED.tst2017plus.en-id.tok.tgz", - ("IWSLT17.TED.tst2017plus.en-id.tok.en", - "IWSLT17.TED.tst2017plus.en-id.tok.id") + _REPO + "IWSLT17.TED.tst2017plus.en-id.tgz", + ("IWSLT17.TED.tst2017plus.en-id.en", + "IWSLT17.TED.tst2017plus.en-id.id") ] ]