diff --git a/keras_nlp/models/albert/albert_classifier_test.py b/keras_nlp/models/albert/albert_classifier_test.py
index e2581df6a1..ebf8a630eb 100644
--- a/keras_nlp/models/albert/albert_classifier_test.py
+++ b/keras_nlp/models/albert/albert_classifier_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_classifier import AlbertClassifier
@@ -27,26 +26,14 @@ class AlbertClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                )
+            ),
             sequence_length=5,
         )
         self.backbone = AlbertBackbone(
             vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
diff --git a/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py b/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
index 36eef72f39..79d3a36bbb 100644
--- a/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
+++ b/keras_nlp/models/albert/albert_masked_lm_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_masked_lm_preprocessor import (
     AlbertMaskedLMPreprocessor,
@@ -26,24 +25,12 @@ class AlbertMaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             # Simplify our testing by masking every available token.
diff --git a/keras_nlp/models/albert/albert_masked_lm_test.py b/keras_nlp/models/albert/albert_masked_lm_test.py
index 456b0edda4..f992ed2b37 100644
--- a/keras_nlp/models/albert/albert_masked_lm_test.py
+++ b/keras_nlp/models/albert/albert_masked_lm_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_backbone import AlbertBackbone
 from keras_nlp.models.albert.albert_masked_lm import AlbertMaskedLM
@@ -29,25 +28,14 @@ class AlbertMaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
-        )
         self.preprocessor = AlbertMaskedLMPreprocessor(
-            AlbertTokenizer(proto=bytes_io.getvalue()),
+            AlbertTokenizer(
+                # Generated using create_albert_test_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "albert_test_vocab.spm"
+                )
+            ),
             # Simplify our testing by masking every available token.
             mask_selection_rate=1.0,
             mask_token_rate=1.0,
diff --git a/keras_nlp/models/albert/albert_preprocessor_test.py b/keras_nlp/models/albert/albert_preprocessor_test.py
index 95cb2c832e..7d6fb4cfd4 100644
--- a/keras_nlp/models/albert/albert_preprocessor_test.py
+++ b/keras_nlp/models/albert/albert_preprocessor_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
+import os
 
 import pytest
-import sentencepiece
 
 from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor
 from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer
@@ -24,24 +23,12 @@ class AlbertPreprocessorTest(TestCase):
     def setUp(self):
-        vocab_data = ["the quick brown fox", "the earth is round"]
-        bytes_io = io.BytesIO()
-        sentencepiece.SentencePieceTrainer.train(
-            sentence_iterator=iter(vocab_data),
-            model_writer=bytes_io,
-            vocab_size=12,
-            model_type="WORD",
-            pad_id=0,
-            unk_id=1,
-            bos_id=2,
-            eos_id=3,
-            pad_piece="<pad>",
-            unk_piece="<unk>",
-            bos_piece="[CLS]",
-            eos_piece="[SEP]",
-            user_defined_symbols="[MASK]",
+        self.tokenizer = AlbertTokenizer(
+            # Generated using create_albert_test_proto.py
+            proto=os.path.join(
+                self.get_test_data_dir(), "albert_test_vocab.spm"
+            )
         )
-        self.tokenizer = AlbertTokenizer(proto=bytes_io.getvalue())
         self.init_kwargs = {
             "tokenizer": self.tokenizer,
             "sequence_length": 8,
diff --git a/keras_nlp/models/albert/albert_tokenizer_test.py b/keras_nlp/models/albert/albert_tokenizer_test.py
index e645436c09..ca80ace281 100644
--- a/keras_nlp/models/albert/albert_tokenizer_test.py
+++ b/keras_nlp/models/albert/albert_tokenizer_test.py
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import io +import os import pytest -import sentencepiece from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer from keras_nlp.tests.test_case import TestCase @@ -23,24 +22,12 @@ class AlbertTokenizerTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - self.init_kwargs = {"proto": bytes_io.getvalue()} + self.init_kwargs = { + # Generated using create_albert_test_proto.py + "proto": os.path.join( + self.get_test_data_dir(), "albert_test_vocab.spm" + ) + } self.input_data = ["the quick brown fox.", "the earth is round."] def test_tokenizer_basics(self): @@ -52,17 +39,13 @@ def test_tokenizer_basics(self): ) def test_errors_missing_special_tokens(self): - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(["abc"]), - model_writer=bytes_io, - vocab_size=5, - pad_id=-1, - eos_id=-1, - bos_id=-1, - ) with self.assertRaises(ValueError): - AlbertTokenizer(proto=bytes_io.getvalue()) + AlbertTokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) @pytest.mark.large def test_smallest_preset(self): diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py index 046c18dd5e..0e0ab7642d 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone from keras_nlp.models.deberta_v3.deberta_v3_classifier import ( @@ -31,25 +30,13 @@ class DebertaV3ClassifierTest(TestCase): def setUp(self): # Setup model. - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", - user_defined_symbols="[MASK]", - ) self.preprocessor = DebertaV3Preprocessor( - DebertaV3Tokenizer(proto=bytes_io.getvalue()), + DebertaV3Tokenizer( + # Generated using create_deberta_v3_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" + ) + ), sequence_length=5, ) self.backbone = DebertaV3Backbone( diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py index faf1ee1a8f..217980ea59 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_preprocessor_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import io +import os import pytest -import sentencepiece from keras_nlp.models.deberta_v3.deberta_v3_masked_lm_preprocessor import ( DebertaV3MaskedLMPreprocessor, @@ -26,24 +25,12 @@ class DebertaV3MaskedLMPreprocessorTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", - user_defined_symbols="[MASK]", + self.tokenizer = DebertaV3Tokenizer( + # Generated using create_deberta_v3_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" + ) ) - self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue()) self.init_kwargs = { "tokenizer": self.tokenizer, # Simplify our testing by masking every available token. diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py index 62f84b508c..32bf71de13 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.deberta_v3.deberta_v3_backbone import DebertaV3Backbone from keras_nlp.models.deberta_v3.deberta_v3_masked_lm import DebertaV3MaskedLM @@ -29,25 +28,13 @@ class DebertaV3MaskedLMTest(TestCase): def setUp(self): # Setup model. - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", - user_defined_symbols="[MASK]", - ) self.preprocessor = DebertaV3MaskedLMPreprocessor( - DebertaV3Tokenizer(proto=bytes_io.getvalue()), + DebertaV3Tokenizer( + # Generated using create_deberta_v3_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" + ) + ), # Simplify our testing by masking every available token. mask_selection_rate=1.0, mask_token_rate=1.0, diff --git a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py index f6f648ab83..a50022f3c7 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_preprocessor_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import io +import os import pytest -import sentencepiece from keras_nlp.models.deberta_v3.deberta_v3_preprocessor import ( DebertaV3Preprocessor, @@ -26,24 +25,12 @@ class DebertaV3PreprocessorTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", - user_defined_symbols="[MASK]", + self.tokenizer = DebertaV3Tokenizer( + # Generated using create_deberta_v3_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" + ) ) - self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue()) self.init_kwargs = { "tokenizer": self.tokenizer, "sequence_length": 8, diff --git a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py index c542de786d..fcaf637974 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.deberta_v3.deberta_v3_tokenizer import DebertaV3Tokenizer from keras_nlp.tests.test_case import TestCase @@ -23,24 +22,12 @@ class DebertaV3TokenizerTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=11, - model_type="WORD", - pad_id=0, - bos_id=1, - eos_id=2, - unk_id=3, - pad_piece="[PAD]", - bos_piece="[CLS]", - eos_piece="[SEP]", - unk_piece="[UNK]", + # Generated using create_deberta_v3_test_proto.py + proto = os.path.join( + self.get_test_data_dir(), "deberta_v3_test_vocab.spm" ) - self.tokenizer = DebertaV3Tokenizer(proto=bytes_io.getvalue()) - self.init_kwargs = {"proto": bytes_io.getvalue()} + self.tokenizer = DebertaV3Tokenizer(proto=proto) + self.init_kwargs = {"proto": proto} self.input_data = ["the quick brown fox.", "the earth is round."] def test_tokenizer_basics(self): @@ -48,28 +35,24 @@ def test_tokenizer_basics(self): cls=DebertaV3Tokenizer, init_kwargs=self.init_kwargs, input_data=self.input_data, - expected_output=[[4, 9, 5, 3], [4, 6, 8, 3]], + expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]], ) def test_errors_missing_special_tokens(self): - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(["abc"]), - model_writer=bytes_io, - vocab_size=5, - pad_id=-1, - eos_id=-1, - bos_id=-1, - ) with self.assertRaises(ValueError): - DebertaV3Tokenizer(proto=bytes_io.getvalue()) + DebertaV3Tokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) def test_mask_token_handling(self): tokenizer = DebertaV3Tokenizer(**self.init_kwargs) - self.assertEqual(tokenizer.get_vocabulary()[11], "[MASK]") - self.assertEqual(tokenizer.id_to_token(11), "[MASK]") - self.assertEqual(tokenizer.token_to_id("[MASK]"), 11) - input_data = [[4, 9, 5, 7, self.tokenizer.mask_token_id]] + self.assertEqual(tokenizer.get_vocabulary()[4], "[MASK]") + self.assertEqual(tokenizer.id_to_token(4), 
"[MASK]") + self.assertEqual(tokenizer.token_to_id("[MASK]"), 4) + input_data = [[5, 10, 6, 8, self.tokenizer.mask_token_id]] output = tokenizer.detokenize(input_data) self.assertEqual(output, ["the quick brown fox"]) diff --git a/keras_nlp/models/f_net/f_net_classifier_test.py b/keras_nlp/models/f_net/f_net_classifier_test.py index b972f64655..c871fbcc7b 100644 --- a/keras_nlp/models/f_net/f_net_classifier_test.py +++ b/keras_nlp/models/f_net/f_net_classifier_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.f_net.f_net_backbone import FNetBackbone from keras_nlp.models.f_net.f_net_classifier import FNetClassifier @@ -27,25 +26,13 @@ class FNetClassifierTest(TestCase): def setUp(self): # Setup model. - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) self.preprocessor = FNetPreprocessor( - FNetTokenizer(proto=bytes_io.getvalue()), + FNetTokenizer( + # Generated using create_f_net_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "f_net_test_vocab.spm" + ) + ), sequence_length=5, ) self.backbone = FNetBackbone( diff --git a/keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py b/keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py index eb7036005a..5f72081a0d 100644 --- a/keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/f_net/f_net_masked_lm_preprocessor_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.f_net.f_net_masked_lm_preprocessor import ( FNetMaskedLMPreprocessor, @@ -26,24 +25,10 @@ class FNetMaskedLMPreprocessorTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", + self.tokenizer = FNetTokenizer( + # Generated using create_f_net_test_proto.py + proto=os.path.join(self.get_test_data_dir(), "f_net_test_vocab.spm") ) - self.tokenizer = FNetTokenizer(proto=bytes_io.getvalue()) self.init_kwargs = { "tokenizer": self.tokenizer, # Simplify our testing by masking every available token. diff --git a/keras_nlp/models/f_net/f_net_masked_lm_test.py b/keras_nlp/models/f_net/f_net_masked_lm_test.py index dc8bb8e9b3..b4931a76fc 100644 --- a/keras_nlp/models/f_net/f_net_masked_lm_test.py +++ b/keras_nlp/models/f_net/f_net_masked_lm_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.f_net.f_net_backbone import FNetBackbone from keras_nlp.models.f_net.f_net_masked_lm import FNetMaskedLM @@ -29,25 +28,13 @@ class FNetMaskedLMTest(TestCase): def setUp(self): # Setup model. 
- vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) self.preprocessor = FNetMaskedLMPreprocessor( - FNetTokenizer(proto=bytes_io.getvalue()), + FNetTokenizer( + # Generated using create_f_net_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "f_net_test_vocab.spm" + ) + ), # Simplify our testing by masking every available token. mask_selection_rate=1.0, mask_token_rate=1.0, diff --git a/keras_nlp/models/f_net/f_net_preprocessor_test.py b/keras_nlp/models/f_net/f_net_preprocessor_test.py index f5470c700d..f67737c828 100644 --- a/keras_nlp/models/f_net/f_net_preprocessor_test.py +++ b/keras_nlp/models/f_net/f_net_preprocessor_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.f_net.f_net_preprocessor import FNetPreprocessor from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer @@ -24,24 +23,10 @@ class FNetPreprocessorTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", + self.tokenizer = FNetTokenizer( + # Generated using create_f_net_test_proto.py + proto=os.path.join(self.get_test_data_dir(), "f_net_test_vocab.spm") ) - self.tokenizer = FNetTokenizer(proto=bytes_io.getvalue()) self.init_kwargs = { "tokenizer": self.tokenizer, "sequence_length": 8, diff --git a/keras_nlp/models/f_net/f_net_tokenizer_test.py b/keras_nlp/models/f_net/f_net_tokenizer_test.py index 80b7f9e037..8d3511dee7 100644 --- a/keras_nlp/models/f_net/f_net_tokenizer_test.py +++ b/keras_nlp/models/f_net/f_net_tokenizer_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import io +import os import pytest -import sentencepiece from keras_nlp.models.f_net.f_net_tokenizer import FNetTokenizer from keras_nlp.tests.test_case import TestCase @@ -23,24 +22,12 @@ class FNetTokenizerTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", - ) - self.init_kwargs = {"proto": bytes_io.getvalue()} + self.init_kwargs = { + # Generated using create_f_net_test_proto.py + "proto": os.path.join( + self.get_test_data_dir(), "f_net_test_vocab.spm" + ) + } self.input_data = ["the quick brown fox.", "the earth is round."] def test_tokenizer_basics(self): @@ -52,17 +39,13 @@ def test_tokenizer_basics(self): ) def test_errors_missing_special_tokens(self): - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(["abc"]), - model_writer=bytes_io, - vocab_size=5, - pad_id=-1, - eos_id=-1, - bos_id=-1, - ) with self.assertRaises(ValueError): - FNetTokenizer(proto=bytes_io.getvalue()) + FNetTokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) @pytest.mark.large def test_smallest_preset(self): diff --git a/keras_nlp/models/t5/t5_tokenizer_test.py b/keras_nlp/models/t5/t5_tokenizer_test.py index 9f6f4e9e8f..be07b486e4 100644 --- a/keras_nlp/models/t5/t5_tokenizer_test.py +++ b/keras_nlp/models/t5/t5_tokenizer_test.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import io +import os import pytest -import sentencepiece -import tensorflow as tf from keras_nlp.models.t5.t5_tokenizer import T5Tokenizer from keras_nlp.tests.test_case import TestCase @@ -24,25 +22,10 @@ class T5TokenizerTest(TestCase): def setUp(self): - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox", "the earth is round"] - ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=11, - model_type="WORD", - bos_id=-1, - pad_id=0, - eos_id=1, - unk_id=2, - pad_piece="", - eos_piece="", - unk_piece="", - user_defined_symbols="[MASK]", - ) - self.init_kwargs = {"proto": bytes_io.getvalue()} + self.init_kwargs = { + # Generated using create_t5_test_proto.py + "proto": os.path.join(self.get_test_data_dir(), "t5_test_vocab.spm") + } self.input_data = ["the quick brown fox.", "the earth is round."] def test_tokenizer_basics(self): @@ -54,17 +37,13 @@ def test_tokenizer_basics(self): ) def test_errors_missing_special_tokens(self): - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(["abc"]), - model_writer=bytes_io, - vocab_size=5, - pad_id=-1, - eos_id=-1, - bos_id=-1, - ) with self.assertRaises(ValueError): - T5Tokenizer(proto=bytes_io.getvalue()) + T5Tokenizer( + # Generated using create_no_special_token_proto.py + proto=os.path.join( + self.get_test_data_dir(), "no_special_token_vocab.spm" + ) + ) @pytest.mark.large def test_smallest_preset(self): diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py index c123cc6bc0..8255a40cf5 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone from keras_nlp.models.xlm_roberta.xlm_roberta_classifier import ( @@ -33,19 +32,13 @@ class XLMRobertaClassifierTest(TestCase): def setUp(self): # Setup model. - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=10, - model_type="WORD", - unk_id=0, - bos_id=1, - eos_id=2, - ) self.preprocessor = XLMRobertaPreprocessor( - XLMRobertaTokenizer(proto=bytes_io.getvalue()), + XLMRobertaTokenizer( + # Generated using create_xlm_roberta_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) + ), sequence_length=5, ) self.backbone = XLMRobertaBackbone( diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py index 6dd0bc0f71..c1bfc7242a 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import io +import os import pytest -import sentencepiece from keras_nlp.models.xlm_roberta.xlm_roberta_masked_lm_preprocessor import ( XLMRobertaMaskedLMPreprocessor, @@ -28,19 +27,12 @@ class XLMRobertaMaskedLMPreprocessorTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=11, - model_type="WORD", - unk_id=0, - bos_id=1, - eos_id=2, - user_defined_symbols="[MASK]", + self.tokenizer = XLMRobertaTokenizer( + # Generated using create_xlm_roberta_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) ) - self.tokenizer = XLMRobertaTokenizer(proto=bytes_io.getvalue()) self.init_kwargs = { "tokenizer": self.tokenizer, # Simplify our testing by masking every available token. @@ -59,11 +51,11 @@ def test_preprocessor_basics(self): input_data=self.input_data, expected_output=( { - "token_ids": [[0, 12, 12, 12, 12, 2, 1, 1, 1, 1, 1, 1]], + "token_ids": [[0, 13, 13, 13, 13, 2, 1, 1, 1, 1, 1, 1]], "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], "mask_positions": [[1, 2, 3, 4]], }, - [[5, 10, 6, 8]], + [[6, 11, 7, 9]], [[1.0, 1.0, 1.0, 1.0]], ), ) @@ -80,7 +72,7 @@ def test_no_masking_zero_rate(self): no_mask_preprocessor(input_data), ( { - "token_ids": [[0, 5, 10, 6, 8, 2, 1, 1, 1, 1, 1, 1]], + "token_ids": [[0, 6, 11, 7, 9, 2, 1, 1, 1, 1, 1, 1]], "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]], "mask_positions": [[0, 0, 0, 0]], }, diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py index 81fafbe4dc..bcbafe4ad9 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.xlm_roberta.xlm_roberta_backbone import XLMRobertaBackbone from keras_nlp.models.xlm_roberta.xlm_roberta_masked_lm import ( @@ -33,20 +32,13 @@ class XLMRobertaMaskedLMTest(TestCase): def setUp(self): # Setup model. - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=11, - model_type="WORD", - unk_id=0, - bos_id=1, - eos_id=2, - user_defined_symbols="[MASK]", - ) self.preprocessor = XLMRobertaMaskedLMPreprocessor( - XLMRobertaTokenizer(proto=bytes_io.getvalue()), + XLMRobertaTokenizer( + # Generated using create_xlm_roberta_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) + ), # Simplify our testing by masking every available token. 
mask_selection_rate=1.0, mask_token_rate=1.0, @@ -76,7 +68,7 @@ def test_masked_lm_basics(self): cls=XLMRobertaMaskedLM, init_kwargs=self.init_kwargs, train_data=self.train_data, - expected_output_shape=(2, 5, 13), + expected_output_shape=(2, 5, 14), ) @pytest.mark.large diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py index 38eb4882f3..3c3bbf2612 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io +import os import pytest -import sentencepiece from keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor import ( XLMRobertaPreprocessor, @@ -28,24 +27,12 @@ class XLMRobertaPreprocessorTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=12, - model_type="WORD", - pad_id=0, - unk_id=1, - bos_id=2, - eos_id=3, - pad_piece="", - unk_piece="", - bos_piece="[CLS]", - eos_piece="[SEP]", - user_defined_symbols="[MASK]", + self.tokenizer = XLMRobertaTokenizer( + # Generated using create_xlm_roberta_test_proto.py + proto=os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) ) - self.tokenizer = XLMRobertaTokenizer(proto=bytes_io.getvalue()) self.init_kwargs = { "tokenizer": self.tokenizer, "sequence_length": 8, diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py index a58ee4c74b..9ec205c725 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import io +import os import pytest -import sentencepiece from keras_nlp.models.xlm_roberta.xlm_roberta_tokenizer import ( XLMRobertaTokenizer, @@ -25,19 +24,12 @@ class XLMRobertaTokenizerTest(TestCase): def setUp(self): - vocab_data = ["the quick brown fox", "the earth is round"] - bytes_io = io.BytesIO() - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=iter(vocab_data), - model_writer=bytes_io, - vocab_size=11, - model_type="WORD", - unk_id=0, - bos_id=1, - eos_id=2, - user_defined_symbols="[MASK]", - ) - self.init_kwargs = {"proto": bytes_io.getvalue()} + self.init_kwargs = { + # Generated using create_xlm_roberta_test_proto.py + "proto": os.path.join( + self.get_test_data_dir(), "xlm_roberta_test_vocab.spm" + ) + } self.input_data = ["the quick brown fox.", "the earth is round."] def test_tokenizer_basics(self): @@ -45,7 +37,7 @@ def test_tokenizer_basics(self): cls=XLMRobertaTokenizer, init_kwargs=self.init_kwargs, input_data=self.input_data, - expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]], + expected_output=[[6, 11, 7, 2], [6, 8, 10, 2]], ) @pytest.mark.large diff --git a/keras_nlp/tests/test_case.py b/keras_nlp/tests/test_case.py index 2025f3ad13..ec29b2add6 100644 --- a/keras_nlp/tests/test_case.py +++ b/keras_nlp/tests/test_case.py @@ -14,6 +14,7 @@ import json import os +import pathlib import re import tensorflow as tf @@ -417,3 +418,6 @@ def compare(actual, expected): self.assertAllClose(actual, expected, atol=0.01, rtol=0.01) tree.map_structure(compare, output, expected_partial_output) + + def get_test_data_dir(self): + return str(pathlib.Path(__file__).parent / "test_data") diff --git a/keras_nlp/tests/test_data/albert_test_vocab.spm b/keras_nlp/tests/test_data/albert_test_vocab.spm new file mode 100644 index 0000000000..8520ca4919 Binary files /dev/null and b/keras_nlp/tests/test_data/albert_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/deberta_v3_test_vocab.spm b/keras_nlp/tests/test_data/deberta_v3_test_vocab.spm new file mode 100644 index 0000000000..1c4aa4bbb8 Binary files /dev/null and b/keras_nlp/tests/test_data/deberta_v3_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/f_net_test_vocab.spm b/keras_nlp/tests/test_data/f_net_test_vocab.spm new file mode 100644 index 0000000000..8520ca4919 Binary files /dev/null and b/keras_nlp/tests/test_data/f_net_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/no_special_token_vocab.spm b/keras_nlp/tests/test_data/no_special_token_vocab.spm new file mode 100644 index 0000000000..582613aba7 Binary files /dev/null and b/keras_nlp/tests/test_data/no_special_token_vocab.spm differ diff --git a/keras_nlp/tests/test_data/t5_test_vocab.spm b/keras_nlp/tests/test_data/t5_test_vocab.spm new file mode 100644 index 0000000000..ce7ecf0c49 Binary files /dev/null and b/keras_nlp/tests/test_data/t5_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/tokenizer_test_vocab.spm b/keras_nlp/tests/test_data/tokenizer_test_vocab.spm new file mode 100644 index 0000000000..ec895a9d67 Binary files /dev/null and b/keras_nlp/tests/test_data/tokenizer_test_vocab.spm differ diff --git a/keras_nlp/tests/test_data/xlm_roberta_test_vocab.spm b/keras_nlp/tests/test_data/xlm_roberta_test_vocab.spm new file mode 100644 index 0000000000..8520ca4919 Binary files /dev/null and b/keras_nlp/tests/test_data/xlm_roberta_test_vocab.spm differ diff --git a/keras_nlp/tokenizers/sentence_piece_tokenizer_test.py b/keras_nlp/tokenizers/sentence_piece_tokenizer_test.py index e488f1d0c1..f3b39711bd 100644 --- 
a/keras_nlp/tokenizers/sentence_piece_tokenizer_test.py +++ b/keras_nlp/tokenizers/sentence_piece_tokenizer_test.py @@ -12,10 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os -import sentencepiece import tensorflow as tf from keras_nlp.tests.test_case import TestCase @@ -25,17 +23,9 @@ class SentencePieceTokenizerTest(TestCase): def setUp(self): super().setUp() - bytes_io = io.BytesIO() - vocab_data = tf.data.Dataset.from_tensor_slices( - ["the quick brown fox."] + self.proto = os.path.join( + self.get_test_data_dir(), "tokenizer_test_vocab.spm" ) - sentencepiece.SentencePieceTrainer.train( - sentence_iterator=vocab_data.as_numpy_iterator(), - model_writer=bytes_io, - vocab_size=7, - model_type="WORD", - ) - self.proto = bytes_io.getvalue() def test_tokenize(self): input_data = ["the quick brown fox."] @@ -112,15 +102,13 @@ def test_error_id_out_of_vocabulary(self): with self.assertRaises(ValueError): tokenizer.id_to_token(-1) - def test_from_file(self): - filepath = os.path.join(self.get_temp_dir(), "model.txt") - input_data = ["the quick brown fox."] - with tf.io.gfile.GFile(filepath, "wb") as file: - file.write(self.proto) + def test_from_bytes(self): + with tf.io.gfile.GFile(self.proto, "rb") as file: + proto = file.read() tokenizer = SentencePieceTokenizer( - proto=filepath, + proto=proto, ) - output_data = tokenizer(input_data) + output_data = tokenizer(["the quick brown fox."]) self.assertAllEqual(output_data, [[6, 5, 3, 4]]) def test_tokenize_then_batch(self): diff --git a/tools/sentencepiece_testing/__init__.py b/tools/sentencepiece_testing/__init__.py new file mode 100644 index 0000000000..ba0c2545e4 --- /dev/null +++ b/tools/sentencepiece_testing/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tools/sentencepiece_testing/create_albert_test_proto.py b/tools/sentencepiece_testing/create_albert_test_proto.py new file mode 100644 index 0000000000..80e82b3cd1 --- /dev/null +++ b/tools/sentencepiece_testing/create_albert_test_proto.py @@ -0,0 +1,37 @@ +# Copyright 2023 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from tools.sentencepiece_testing.utils import train_sentencepiece
+
+
+def main():
+    train_sentencepiece(
+        ["the quick brown fox", "the earth is round"],
+        "albert_test_vocab.spm",
+        vocab_size=12,
+        model_type="WORD",
+        pad_id=0,
+        unk_id=1,
+        bos_id=2,
+        eos_id=3,
+        pad_piece="<pad>",
+        unk_piece="<unk>",
+        bos_piece="[CLS]",
+        eos_piece="[SEP]",
+        user_defined_symbols="[MASK]",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/sentencepiece_testing/create_deberta_v3_test_proto.py b/tools/sentencepiece_testing/create_deberta_v3_test_proto.py
new file mode 100644
index 0000000000..c3f98867c5
--- /dev/null
+++ b/tools/sentencepiece_testing/create_deberta_v3_test_proto.py
@@ -0,0 +1,37 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tools.sentencepiece_testing.utils import train_sentencepiece
+
+
+def main():
+    train_sentencepiece(
+        ["the quick brown fox", "the earth is round"],
+        "deberta_v3_test_vocab.spm",
+        vocab_size=12,
+        model_type="WORD",
+        pad_id=0,
+        bos_id=1,
+        eos_id=2,
+        unk_id=3,
+        pad_piece="[PAD]",
+        bos_piece="[CLS]",
+        eos_piece="[SEP]",
+        unk_piece="[UNK]",
+        user_defined_symbols="[MASK]",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/sentencepiece_testing/create_f_net_test_proto.py b/tools/sentencepiece_testing/create_f_net_test_proto.py
new file mode 100644
index 0000000000..949a5692f9
--- /dev/null
+++ b/tools/sentencepiece_testing/create_f_net_test_proto.py
@@ -0,0 +1,37 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tools.sentencepiece_testing.utils import train_sentencepiece
+
+
+def main():
+    train_sentencepiece(
+        ["the quick brown fox", "the earth is round"],
+        "f_net_test_vocab.spm",
+        vocab_size=12,
+        model_type="WORD",
+        pad_id=0,
+        unk_id=1,
+        bos_id=2,
+        eos_id=3,
+        pad_piece="<pad>",
+        unk_piece="<unk>",
+        bos_piece="[CLS]",
+        eos_piece="[SEP]",
+        user_defined_symbols="[MASK]",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/sentencepiece_testing/create_no_special_token_proto.py b/tools/sentencepiece_testing/create_no_special_token_proto.py
new file mode 100644
index 0000000000..c13ef6e05a
--- /dev/null
+++ b/tools/sentencepiece_testing/create_no_special_token_proto.py
@@ -0,0 +1,30 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tools.sentencepiece_testing.utils import train_sentencepiece
+
+
+def main():
+    train_sentencepiece(
+        ["abc"],
+        "no_special_token_vocab.spm",
+        vocab_size=5,
+        pad_id=-1,
+        eos_id=-1,
+        bos_id=-1,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/sentencepiece_testing/create_sentence_piece_tokenizer_proto.py b/tools/sentencepiece_testing/create_sentence_piece_tokenizer_proto.py
new file mode 100644
index 0000000000..a40eade848
--- /dev/null
+++ b/tools/sentencepiece_testing/create_sentence_piece_tokenizer_proto.py
@@ -0,0 +1,28 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tools.sentencepiece_testing.utils import train_sentencepiece
+
+
+def main():
+    train_sentencepiece(
+        ["the quick brown fox."],
+        "tokenizer_test_vocab.spm",
+        vocab_size=7,
+        model_type="WORD",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/sentencepiece_testing/create_t5_test_proto.py b/tools/sentencepiece_testing/create_t5_test_proto.py
new file mode 100644
index 0000000000..b7e28160e5
--- /dev/null
+++ b/tools/sentencepiece_testing/create_t5_test_proto.py
@@ -0,0 +1,36 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tools.sentencepiece_testing.utils import train_sentencepiece
+
+
+def main():
+    train_sentencepiece(
+        ["the quick brown fox", "the earth is round"],
+        "t5_test_vocab.spm",
+        vocab_size=11,
+        model_type="WORD",
+        bos_id=-1,
+        pad_id=0,
+        eos_id=1,
+        unk_id=2,
+        pad_piece="<pad>",
+        eos_piece="</s>",
+        unk_piece="<unk>",
+        user_defined_symbols="[MASK]",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/sentencepiece_testing/create_xlm_roberta_test_proto.py b/tools/sentencepiece_testing/create_xlm_roberta_test_proto.py
new file mode 100644
index 0000000000..988d161f99
--- /dev/null
+++ b/tools/sentencepiece_testing/create_xlm_roberta_test_proto.py
@@ -0,0 +1,37 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tools.sentencepiece_testing.utils import train_sentencepiece
+
+
+def main():
+    train_sentencepiece(
+        ["the quick brown fox", "the earth is round"],
+        "xlm_roberta_test_vocab.spm",
+        vocab_size=12,
+        model_type="WORD",
+        pad_id=0,
+        unk_id=1,
+        bos_id=2,
+        eos_id=3,
+        pad_piece="<pad>",
+        unk_piece="<unk>",
+        bos_piece="[CLS]",
+        eos_piece="[SEP]",
+        user_defined_symbols="[MASK]",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/sentencepiece_testing/utils.py b/tools/sentencepiece_testing/utils.py
new file mode 100644
index 0000000000..9deebd9737
--- /dev/null
+++ b/tools/sentencepiece_testing/utils.py
@@ -0,0 +1,33 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import pathlib
+
+import sentencepiece
+
+
+def train_sentencepiece(data, filename, *args, **kwargs):
+    bytes_io = io.BytesIO()
+    sentencepiece.SentencePieceTrainer.train(
+        sentence_iterator=iter(data), model_writer=bytes_io, *args, **kwargs
+    )
+    with open(
+        pathlib.Path(__file__).parent.parent.parent
+        / "keras_nlp"
+        / "tests"
+        / "test_data"
+        / filename,
+        mode="wb",
+    ) as f:
+        f.write(bytes_io.getbuffer())
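
Note on regenerating the checked-in vocabularies (illustrative, not part of the patch): the *.spm files under keras_nlp/tests/test_data are written by the generator scripts above rather than edited by hand. Because each script imports tools.sentencepiece_testing.utils, it is run as a module from the repository root, for example:

    python -m tools.sentencepiece_testing.create_albert_test_proto

train_sentencepiece then overwrites keras_nlp/tests/test_data/albert_test_vocab.spm in place, and the regenerated binary is committed together with any test expectations that depend on it.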