diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
index f954925ce3..94efdd523e 100644
--- a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
+++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
@@ -56,10 +56,13 @@ class DistilBertTokenizer(WordPieceTokenizer):
         "distil_bert_base_en_uncased",
     )
     tokenizer("The quick brown fox jumped.")
 
+    # Batched input.
     tokenizer(["The quick brown fox jumped.", "The fox slept."])
 
+    # Detokenization.
     tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
 
+    # Custom vocabulary.
     vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
     vocab += ["The", "quick", "brown", "fox", "jumped", "."]
diff --git a/keras_nlp/models/f_net/f_net_preprocessor.py b/keras_nlp/models/f_net/f_net_preprocessor.py
index 21c6ec7f1b..731dd96548 100644
--- a/keras_nlp/models/f_net/f_net_preprocessor.py
+++ b/keras_nlp/models/f_net/f_net_preprocessor.py
@@ -69,10 +69,8 @@ class FNetPreprocessor(Preprocessor):
 
     Directly calling the from_preset().
     ```python
-    tokenizer = keras_nlp.models.FNetTokenizer(proto="model.spm")
-    preprocessor = keras_nlp.models.FNetPreprocessor(
-        tokenizer=tokenizer,
-        sequence_length=10,
+    preprocessor = keras_nlp.models.FNetPreprocessor.from_preset(
+        "f_net_base_en"
     )
 
     # Tokenize and pack a single sentence.
@@ -86,10 +84,6 @@ class FNetPreprocessor(Preprocessor):
     first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
     second = tf.constant(["The fox tripped.", "Oh look, a whale."])
     preprocessor((first, second))
-
-
-    preprocessor = keras_nlp.models.FNetPreprocessor(tokenizer)
-    preprocessor("The quick brown fox jumped.")
     ```
 
     Mapping with `tf.data.Dataset`.
diff --git a/keras_nlp/models/preprocessor.py b/keras_nlp/models/preprocessor.py
index 70b2e72b70..bc70817ed7 100644
--- a/keras_nlp/models/preprocessor.py
+++ b/keras_nlp/models/preprocessor.py
@@ -67,18 +67,10 @@ def from_preset(
 
         Examples:
         ```python
-        # Load preprocessor from preset
+        # Load a preprocessor layer from a preset.
        preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
             "{{example_preset_name}}",
         )
-        preprocessor("The quick brown fox jumped.")
-
-        # Override sequence_length
-        preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
-            "{{example_preset_name}}",
-            sequence_length=64
-        )
-        preprocessor("The quick brown fox jumped.")
         ```
         """
         if not cls.presets:
diff --git a/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py b/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py
index 6a7b76d257..0ea8df5a36 100644
--- a/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py
+++ b/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py
@@ -75,6 +75,8 @@ class RobertaMaskedLMPreprocessor(RobertaPreprocessor):
             generates label weights.
 
     Examples:
+
+    Directly calling the layer on data.
     ```python
     # Load the preprocessor from a preset.
     preprocessor = keras_nlp.models.RobertaMaskedLMPreprocessor.from_preset(
diff --git a/keras_nlp/models/t5/t5_tokenizer.py b/keras_nlp/models/t5/t5_tokenizer.py
index d6c058c577..5eb2437b5d 100644
--- a/keras_nlp/models/t5/t5_tokenizer.py
+++ b/keras_nlp/models/t5/t5_tokenizer.py
@@ -43,7 +43,25 @@ class T5Tokenizer(SentencePieceTokenizer):
     Examples:
 
     ```python
-    tokenizer = keras_nlp.models.T5Tokenizer(proto="model.spm")
+    bytes_io = io.BytesIO()
+    ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
+    sentencepiece.SentencePieceTrainer.train(
+        sentence_iterator=ds.as_numpy_iterator(),
+        model_writer=bytes_io,
+        vocab_size=8,
+        model_type="WORD",
+        bos_id=-1,
+        pad_id=0,
+        eos_id=1,
+        unk_id=2,
+        pad_piece="<pad>",
+        eos_piece="</s>",
+        unk_piece="<unk>",
+    )
+    tokenizer = keras_nlp.models.T5Tokenizer(
+        proto=bytes_io.getvalue(),
+    )
+    tokenizer("The quick brown fox jumped.")
 
     # Batched inputs.
     tokenizer(["the quick brown fox", "the earth is round"])
@@ -52,7 +70,7 @@ class T5Tokenizer(SentencePieceTokenizer):
     tokenizer("the quick brown fox")
 
     # Detokenization.
-    tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]]))
+    tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
     ```
     """
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py
index 2ed7baa0bf..81d94f8de3 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py
@@ -77,6 +77,8 @@ class XLMRobertaMaskedLMPreprocessor(XLMRobertaPreprocessor):
             generates label weights.
 
     Examples:
+
+    Directly calling the layer on data.
     ```python
     # Load the preprocessor from a preset.
     preprocessor = keras_nlp.models.XLMRobertaMaskedLMPreprocessor.from_preset(
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
index 6ecb5016e7..122c372c01 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
@@ -74,6 +74,8 @@ class XLMRobertaPreprocessor(Preprocessor):
         sample_weight: Any label weight data. Will be passed through unaltered.
 
     Examples:
+
+    Directly calling the layer on data.
     ```python
     preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
         "xlm_roberta_base_multi"
diff --git a/keras_nlp/samplers/beam_sampler.py b/keras_nlp/samplers/beam_sampler.py
index 2830640d7e..a3970223b1 100644
--- a/keras_nlp/samplers/beam_sampler.py
+++ b/keras_nlp/samplers/beam_sampler.py
@@ -73,22 +73,24 @@ def next(prompt, cache, index):
     char_lookup = {v: k for k, v in int_lookup.items()}
     batch_size, length, vocab_size = 1, 8, len(int_lookup)
 
-    def next(prompt, state, index):
+    def next(prompt, cache, index):
+        prompt_batch_size = tf.shape(prompt)[0]
+        hidden_states = tf.ones((prompt_batch_size, 10))
         # A uniform distribution over our alphabet.
         logits = tf.ones((batch_size, vocab_size))
-        return logits, state
+        return logits, hidden_states, cache
 
-    output = keras_nlp.samplers.BeamSampler(return_all_beams=True)(
+    beams, probs = keras_nlp.samplers.BeamSampler(return_all_beams=True)(
         next=next,
         prompt=tf.fill((batch_size, length,), char_lookup['z']),
         index=5,
     )
-    print(output[0].shape)
+    print(beams.shape)
     # >>> (1, 5, 8)
-    print(output[1].shape)
+    print(probs.shape)
     # >>> (1, 5)
-    print(["".join([int_lookup[i] for i in s]) for s in output.numpy()])
+    print(["".join([int_lookup[i] for i in s]) for s in beams[0].numpy()])
     # >>> ['zzzzzeee', 'zzzzzeed', 'zzzzzeec', 'zzzzzeea', 'zzzzzeeb']
     ```
     """
diff --git a/keras_nlp/tests/doc_tests/docstring_test.py b/keras_nlp/tests/doc_tests/docstring_test.py
index 55f5c2053d..b3129f416e 100644
--- a/keras_nlp/tests/doc_tests/docstring_test.py
+++ b/keras_nlp/tests/doc_tests/docstring_test.py
@@ -52,10 +52,6 @@ def test_docstrings():
     runner = unittest.TextTestRunner()
     suite = unittest.TestSuite()
     for module in keras_nlp_modules:
-        # Temporarily stop testing gpt2 & deberta docstrings until we are
-        # exporting the symbols.
-        if "gpt2" in module.__name__ or "deberta_v3" in module.__name__:
-            continue
         suite.addTest(
             doctest.DocTestSuite(
                 module,
@@ -98,23 +94,6 @@ def test_fenced_docstrings():
     runner = unittest.TextTestRunner()
     suite = unittest.TestSuite()
     for module in keras_nlp_modules:
-        # Do not test certain modules.
-        if module.__name__ in [
-            # Base classes.
-            "keras_nlp.models.backbone",
-            "keras_nlp.models.preprocessor",
-            "keras_nlp.models.task",
-            "keras_nlp.tokenizers.byte_pair_tokenizer",
-            "keras_nlp.tokenizers.sentence_piece_tokenizer",
-            "keras_nlp.tokenizers.word_piece_tokenizer",
-            # Preprocessors and tokenizers which use `model.spm` (temporary).
-            "keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor",
-            "keras_nlp.models.f_net.f_net_preprocessor",
-            "keras_nlp.models.f_net.f_net_tokenizer",
-            "keras_nlp.models.t5.t5_tokenizer",
-        ]:
-            continue
-
         suite.addTest(
             doctest.DocTestSuite(
                 module,
diff --git a/keras_nlp/tests/doc_tests/fenced_docstring_lib.py b/keras_nlp/tests/doc_tests/fenced_docstring_lib.py
index d1911d848d..38819429d5 100644
--- a/keras_nlp/tests/doc_tests/fenced_docstring_lib.py
+++ b/keras_nlp/tests/doc_tests/fenced_docstring_lib.py
@@ -91,6 +91,11 @@ def get_examples(
         if re.search("doctest.*skip", match.group(0), re.IGNORECASE):
             continue
 
+        # Do not test any docstring with our format string markers.
+        # These will not run until formatted.
+        if re.search("{{", match.group(0)):
+            continue
+
         groups = match.groupdict()
         source = textwrap.dedent(groups["doctest"])
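
Note on the final hunk: `get_examples` now skips any fenced example that still contains `{{...}}` format-string markers, such as the `{{preprocessor_name}}` placeholders left in `keras_nlp/models/preprocessor.py`, since those examples will not run until they are formatted. Below is a minimal, standalone sketch of that guard; the two example strings are hypothetical stand-ins, not taken from the library.

```python
import re

# Hypothetical fenced examples: one fully formatted, one still carrying
# unformatted `{{...}}` template markers like the base `Preprocessor` docstring.
examples = [
    'preprocessor = keras_nlp.models.BertPreprocessor.from_preset(\n'
    '    "bert_base_en_uncased",\n'
    ')',
    'preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(\n'
    '    "{{example_preset_name}}",\n'
    ')',
]

for source in examples:
    # Same check the diff adds: skip examples that have not been formatted yet.
    if re.search("{{", source):
        print("skipped (contains format-string markers)")
        continue
    print("collected for doctest")
```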