diff --git a/keras_nlp/models/albert/albert_tokenizer_test.py b/keras_nlp/models/albert/albert_tokenizer_test.py
index ca80ace281..74ad0604dc 100644
--- a/keras_nlp/models/albert/albert_tokenizer_test.py
+++ b/keras_nlp/models/albert/albert_tokenizer_test.py
@@ -28,14 +28,14 @@ def setUp(self):
                 self.get_test_data_dir(), "albert_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=AlbertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 1], [5, 7, 9, 1]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )

     def test_errors_missing_special_tokens(self):
diff --git a/keras_nlp/models/bart/bart_tokenizer_test.py b/keras_nlp/models/bart/bart_tokenizer_test.py
index acfdbc3d87..5a0015357b 100644
--- a/keras_nlp/models/bart/bart_tokenizer_test.py
+++ b/keras_nlp/models/bart/bart_tokenizer_test.py
@@ -37,7 +37,12 @@ def test_tokenizer_basics(self):
             cls=BartTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
+            # TODO: </s> should not get tokenized as <s>
             expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]],
+            expected_detokenize_output=[
+                "<s> airplane at airport<s><pad>",
+                " airplane airport",
+            ],
         )

     def test_errors_missing_special_tokens(self):
diff --git a/keras_nlp/models/bert/bert_tokenizer_test.py b/keras_nlp/models/bert/bert_tokenizer_test.py
index 29ed902a62..e53419dab4 100644
--- a/keras_nlp/models/bert/bert_tokenizer_test.py
+++ b/keras_nlp/models/bert/bert_tokenizer_test.py
@@ -24,20 +24,20 @@ def setUp(self):
         self.vocab += ["THE", "QUICK", "BROWN", "FOX"]
         self.vocab += ["the", "quick", "brown", "fox"]
         self.init_kwargs = {"vocabulary": self.vocab}
-        self.input_data = ["THE QUICK BROWN FOX.", "THE FOX."]
+        self.input_data = ["THE QUICK BROWN FOX", "THE FOX"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=BertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 6, 7, 8, 1], [5, 8, 1]],
+            expected_output=[[5, 6, 7, 8], [5, 8]],
         )

     def test_lowercase(self):
         tokenizer = BertTokenizer(vocabulary=self.vocab, lowercase=True)
         output = tokenizer(self.input_data)
-        self.assertAllEqual(output, [[9, 10, 11, 12, 1], [9, 12, 1]])
+        self.assertAllEqual(output, [[9, 10, 11, 12], [9, 12]])

     def test_errors_missing_special_tokens(self):
         with self.assertRaises(ValueError):
diff --git a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py
index fcaf637974..3c17cfa397 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py
@@ -28,14 +28,14 @@ def setUp(self):
         )
         self.tokenizer = DebertaV3Tokenizer(proto=proto)
         self.init_kwargs = {"proto": proto}
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=DebertaV3Tokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )

     def test_errors_missing_special_tokens(self):
diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py
index b025b4e7fb..e4bfba41d3 100644
--- a/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py
+++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py
@@ -26,20 +26,20 @@ def setUp(self):
         self.vocab += ["THE", "QUICK", "BROWN", "FOX"]
         self.vocab += ["the", "quick", "brown", "fox"]
         self.init_kwargs = {"vocabulary": self.vocab}
-        self.input_data = ["THE QUICK BROWN FOX.", "THE FOX."]
+        self.input_data = ["THE QUICK BROWN FOX", "THE FOX"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=DistilBertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 6, 7, 8, 1], [5, 8, 1]],
+            expected_output=[[5, 6, 7, 8], [5, 8]],
         )

     def test_lowercase(self):
         tokenizer = DistilBertTokenizer(vocabulary=self.vocab, lowercase=True)
         output = tokenizer(self.input_data)
-        self.assertAllEqual(output, [[9, 10, 11, 12, 1], [9, 12, 1]])
+        self.assertAllEqual(output, [[9, 10, 11, 12], [9, 12]])

     def test_errors_missing_special_tokens(self):
         with self.assertRaises(ValueError):
diff --git a/keras_nlp/models/f_net/f_net_tokenizer_test.py b/keras_nlp/models/f_net/f_net_tokenizer_test.py
index 8d3511dee7..3dde34e849 100644
--- a/keras_nlp/models/f_net/f_net_tokenizer_test.py
+++ b/keras_nlp/models/f_net/f_net_tokenizer_test.py
@@ -28,14 +28,14 @@ def setUp(self):
                 self.get_test_data_dir(), "f_net_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=FNetTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 1], [5, 7, 9, 1]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )

     def test_errors_missing_special_tokens(self):
diff --git a/keras_nlp/models/roberta/roberta_tokenizer_test.py b/keras_nlp/models/roberta/roberta_tokenizer_test.py
index e5fcb1867d..3b2305608d 100644
--- a/keras_nlp/models/roberta/roberta_tokenizer_test.py
+++ b/keras_nlp/models/roberta/roberta_tokenizer_test.py
@@ -37,7 +37,12 @@ def test_tokenizer_basics(self):
             cls=RobertaTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
+            # TODO: </s> should not get tokenized as <s>
             expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]],
+            expected_detokenize_output=[
+                "<s> airplane at airport<s><pad>",
+                " airplane airport",
+            ],
         )

     def test_errors_missing_special_tokens(self):
diff --git a/keras_nlp/models/t5/t5_tokenizer_test.py b/keras_nlp/models/t5/t5_tokenizer_test.py
index be07b486e4..77ad734660 100644
--- a/keras_nlp/models/t5/t5_tokenizer_test.py
+++ b/keras_nlp/models/t5/t5_tokenizer_test.py
@@ -26,14 +26,14 @@ def setUp(self):
             # Generated using create_t5_test_proto.py
             "proto": os.path.join(self.get_test_data_dir(), "t5_test_vocab.spm")
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=T5Tokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[4, 9, 5, 2], [4, 6, 8, 2]],
+            expected_output=[[4, 9, 5, 7], [4, 6, 8, 10]],
         )

     def test_errors_missing_special_tokens(self):
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
index 5e7873a455..eacdcc7337 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -157,9 +157,6 @@ def tokenize(self, inputs):
         return tf.add(tokens, 1)

     def detokenize(self, inputs):
-        if inputs.dtype == tf.string:
-            return super().detokenize(inputs)
-
         tokens = tf.ragged.boolean_mask(
             inputs, tf.not_equal(inputs, self.mask_token_id)
         )
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
index 9ec205c725..2057eff9eb 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py
@@ -30,14 +30,14 @@ def setUp(self):
                 self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=XLMRobertaTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[6, 11, 7, 2], [6, 8, 10, 2]],
+            expected_output=[[6, 11, 7, 9], [6, 8, 10, 12]],
         )

     @pytest.mark.large
diff --git a/keras_nlp/tests/test_case.py b/keras_nlp/tests/test_case.py
index ec29b2add6..6fe72ed497 100644
--- a/keras_nlp/tests/test_case.py
+++ b/keras_nlp/tests/test_case.py
@@ -24,6 +24,7 @@
 from keras_nlp.backend import config
 from keras_nlp.backend import keras
 from keras_nlp.backend import ops
+from keras_nlp.tokenizers.tokenizer import Tokenizer
 from keras_nlp.utils.tensor_utils import is_float_dtype
 from keras_nlp.utils.tensor_utils import standardize_dtype

@@ -203,7 +204,7 @@ def run_preprocessing_layer_test(
         init_kwargs,
         input_data,
         expected_output=None,
-        batch_size=2,
+        expected_detokenize_output=None,
     ):
         """Run basic tests for a preprocessing layer."""
         layer = cls(**init_kwargs)
@@ -219,6 +220,13 @@ def run_preprocessing_layer_test(
         else:
             output = layer(input_data)

+        # For tokenizers only, also check detokenize.
+        if isinstance(layer, Tokenizer):
+            if not expected_detokenize_output:
+                expected_detokenize_output = input_data
+            detokenize_output = layer.detokenize(output)
+            self.assertAllEqual(detokenize_output, expected_detokenize_output)
+
         # Run with an unbatched dataset.
         output_ds = ds.map(layer).ragged_batch(1_000)
         self.assertAllClose(output, output_ds.get_single_element())
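For reference, a minimal sketch (not part of the diff) of the tokenize/detokenize round trip that the new check in run_preprocessing_layer_test asserts, using the BERT case from the hunks above. The five-token base vocabulary is an assumption; the diff only shows the `self.vocab +=` lines. Dropping the trailing "." from the inputs matters because it has no vocabulary entry and previously tokenized to [UNK] (id 1, the trailing 1 in the old expected outputs), so the old inputs could not be recovered by detokenize.

# Sketch of the round trip the new detokenize check exercises.
from keras_nlp.models import BertTokenizer

vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]  # assumed base vocab, not shown in the hunk
vocab += ["THE", "QUICK", "BROWN", "FOX"]
vocab += ["the", "quick", "brown", "fox"]

tokenizer = BertTokenizer(vocabulary=vocab)
token_ids = tokenizer(["THE QUICK BROWN FOX", "THE FOX"])  # ragged ids [[5, 6, 7, 8], [5, 8]]

# With no "." in the input, no token maps to [UNK], so detokenize recovers the
# original strings and the test can compare against input_data directly.
detokenized = tokenizer.detokenize(token_ids)  # ["THE QUICK BROWN FOX", "THE FOX"]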