Commit 08a96e2

Add detokenize testing for model tokenizers

1 parent 6b66ad8
11 files changed: +33 -18 lines

keras_nlp/models/albert/albert_tokenizer_test.py (2 additions & 2 deletions)

@@ -28,14 +28,14 @@ def setUp(self):
                 self.get_test_data_dir(), "albert_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=AlbertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 1], [5, 7, 9, 1]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )

     def test_errors_missing_special_tokens(self):
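Note on the input change: in these tiny test vocabularies a trailing "." is out of vocabulary, so the final word collapsed to a single unknown-token id (presumably the repeated 1 above), and an unknown id cannot be detokenized back to the original text. Dropping the periods keeps every input round-trippable. A toy sketch of the failure mode, using a hypothetical word-level vocabulary (the ids echo the test's style, not the real albert_test_vocab.spm contents):

# Hypothetical vocabulary, for illustration only.
vocab = {"<unk>": 1, "the": 5, "brown": 6, "earth": 7, "fox": 8, "quick": 10}
inverse = {i: w for w, i in vocab.items()}

def tokenize(text):
    # Any word not in the vocabulary collapses to <unk>.
    return [vocab.get(word, vocab["<unk>"]) for word in text.split()]

def detokenize(ids):
    return " ".join(inverse[i] for i in ids)

print(detokenize(tokenize("the quick brown fox")))   # the quick brown fox
print(detokenize(tokenize("the quick brown fox.")))  # the quick brown <unk>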

keras_nlp/models/bart/bart_tokenizer_test.py (5 additions & 0 deletions)

@@ -37,7 +37,12 @@ def test_tokenizer_basics(self):
             cls=BartTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
+            # TODO: </s> should not get tokenized as <s>
             expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]],
+            expected_detokenize_output=[
+                "<s> airplane at airport<s><pad>",
+                " airplane airport",
+            ],
         )

     def test_errors_missing_special_tokens(self):
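The new expected_detokenize_output argument is the point of this commit: the shared harness now exercises the inverse mapping too. Its internals are not shown in this diff; a rough sketch of the kind of assertion it presumably adds (the function name and structure here are illustrative, not keras_nlp's actual code):

def check_detokenize(tokenizer, input_data, expected):
    # Tokenize, then map the ids straight back to text.
    output = tokenizer.detokenize(tokenizer(input_data))
    for got, want in zip(output, expected):
        # detokenize returns a tensor of byte strings.
        assert got.numpy().decode("utf-8") == want

Note that the expected strings are deliberately not byte-identical to the inputs: special tokens like <s> and <pad> survive detokenization, and the TODO records a known bug where </s> comes back as <s>.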

keras_nlp/models/bert/bert_tokenizer_test.py (2 additions & 2 deletions)

@@ -24,14 +24,14 @@ def setUp(self):
         self.vocab += ["THE", "QUICK", "BROWN", "FOX"]
         self.vocab += ["the", "quick", "brown", "fox"]
         self.init_kwargs = {"vocabulary": self.vocab}
-        self.input_data = ["THE QUICK BROWN FOX.", "THE FOX."]
+        self.input_data = ["THE QUICK BROWN FOX", "THE FOX"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=BertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 6, 7, 8, 1], [5, 8, 1]],
+            expected_output=[[5, 6, 7, 8], [5, 8]],
         )

     def test_lowercase(self):

keras_nlp/models/deberta_v3/deberta_v3_tokenizer_test.py (2 additions & 2 deletions)

@@ -28,14 +28,14 @@ def setUp(self):
         )
         self.tokenizer = DebertaV3Tokenizer(proto=proto)
         self.init_kwargs = {"proto": proto}
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=DebertaV3Tokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 3], [5, 7, 9, 3]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )

     def test_errors_missing_special_tokens(self):

keras_nlp/models/distil_bert/distil_bert_tokenizer_test.py (2 additions & 2 deletions)

@@ -26,14 +26,14 @@ def setUp(self):
         self.vocab += ["THE", "QUICK", "BROWN", "FOX"]
         self.vocab += ["the", "quick", "brown", "fox"]
         self.init_kwargs = {"vocabulary": self.vocab}
-        self.input_data = ["THE QUICK BROWN FOX.", "THE FOX."]
+        self.input_data = ["THE QUICK BROWN FOX", "THE FOX"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=DistilBertTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 6, 7, 8, 1], [5, 8, 1]],
+            expected_output=[[5, 6, 7, 8], [5, 8]],
         )

     def test_lowercase(self):

keras_nlp/models/f_net/f_net_tokenizer_test.py (2 additions & 2 deletions)

@@ -28,14 +28,14 @@ def setUp(self):
                 self.get_test_data_dir(), "f_net_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=FNetTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[5, 10, 6, 1], [5, 7, 9, 1]],
+            expected_output=[[5, 10, 6, 8], [5, 7, 9, 11]],
         )

     def test_errors_missing_special_tokens(self):

keras_nlp/models/roberta/roberta_tokenizer_test.py (5 additions & 0 deletions)

@@ -37,7 +37,12 @@ def test_tokenizer_basics(self):
             cls=RobertaTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
+            # TODO: </s> should not get tokenized as <s>
             expected_output=[[0, 4, 5, 6, 4, 7, 0, 1], [4, 5, 4, 7]],
+            expected_detokenize_output=[
+                "<s> airplane at airport<s><pad>",
+                " airplane airport",
+            ],
         )

     def test_errors_missing_special_tokens(self):

keras_nlp/models/t5/t5_tokenizer_test.py (2 additions & 2 deletions)

@@ -26,14 +26,14 @@ def setUp(self):
             # Generated using create_t5_test_proto.py
             "proto": os.path.join(self.get_test_data_dir(), "t5_test_vocab.spm")
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=T5Tokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[4, 9, 5, 2], [4, 6, 8, 2]],
+            expected_output=[[4, 9, 5, 7], [4, 6, 8, 10]],
         )

     def test_errors_missing_special_tokens(self):

keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py (0 additions & 3 deletions)

@@ -157,9 +157,6 @@ def tokenize(self, inputs):
         return tf.add(tokens, 1)

     def detokenize(self, inputs):
-        if inputs.dtype == tf.string:
-            return super().detokenize(inputs)
-
         tokens = tf.ragged.boolean_mask(
             inputs, tf.not_equal(inputs, self.mask_token_id)
         )
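With the string-dtype guard removed, detokenize always takes the integer-id path that the new tests exercise. For context, a sketch of how the surviving method plausibly reads, assuming (the rest of the body falls outside this hunk) that it ends by undoing the +1 offset tokenize applies and delegating to the SentencePiece parent class:

import tensorflow as tf

from keras_nlp.tokenizers import SentencePieceTokenizer

class XLMRobertaTokenizer(SentencePieceTokenizer):  # simplified stub
    def detokenize(self, inputs):
        # Drop <mask> ids; the underlying SentencePiece model has no
        # entry for them.
        tokens = tf.ragged.boolean_mask(
            inputs, tf.not_equal(inputs, self.mask_token_id)
        )
        # Assumed: shift ids back by the offset that tokenize() adds
        # (see `return tf.add(tokens, 1)` in the context line above).
        tokens = tf.subtract(tokens, 1)
        return super().detokenize(tokens)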

keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer_test.py (2 additions & 2 deletions)

@@ -30,14 +30,14 @@ def setUp(self):
                 self.get_test_data_dir(), "xlm_roberta_test_vocab.spm"
             )
         }
-        self.input_data = ["the quick brown fox.", "the earth is round."]
+        self.input_data = ["the quick brown fox", "the earth is round"]

     def test_tokenizer_basics(self):
         self.run_preprocessing_layer_test(
             cls=XLMRobertaTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[6, 11, 7, 2], [6, 8, 10, 2]],
+            expected_output=[[6, 11, 7, 9], [6, 8, 10, 12]],
         )

     @pytest.mark.large
