From 6548c512cdef8c2d5f006b236e3ec9a8e1f85496 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Tue, 18 Apr 2023 22:00:25 -0700
Subject: [PATCH] Misc fixes to docstrings

Another round of style edits, and breakage fixes for docstrings.

With these changes, all our fenced docstrings now run across the repo!
Though it does take 20 minutes on a fairly large machine.
---
 .../distil_bert/distil_bert_tokenizer.py      |  3 +++
 keras_nlp/models/f_net/f_net_preprocessor.py  | 10 ++-------
 keras_nlp/models/preprocessor.py              | 10 +--------
 .../roberta/roberta_masked_lm_preprocessor.py |  2 ++
 keras_nlp/models/t5/t5_tokenizer.py           | 22 +++++++++++++++++--
 .../xlm_roberta_masked_lm_preprocessor.py     |  2 ++
 .../xlm_roberta/xlm_roberta_preprocessor.py   |  2 ++
 keras_nlp/samplers/beam_sampler.py            | 14 +++++++-----
 keras_nlp/tests/doc_tests/docstring_test.py   | 21 ------------------
 .../tests/doc_tests/fenced_docstring_lib.py   |  5 +++++
 10 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
index f954925ce3..94efdd523e 100644
--- a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
+++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
@@ -56,10 +56,13 @@ class DistilBertTokenizer(WordPieceTokenizer):
         "distil_bert_base_en_uncased",
     )
     tokenizer("The quick brown fox jumped.")
 
+    # Batched input.
     tokenizer(["The quick brown fox jumped.", "The fox slept."])
 
+    # Detokenization.
     tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
 
+    # Custom vocabulary.
     vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
     vocab += ["The", "quick", "brown", "fox", "jumped", "."]
diff --git a/keras_nlp/models/f_net/f_net_preprocessor.py b/keras_nlp/models/f_net/f_net_preprocessor.py
index 21c6ec7f1b..731dd96548 100644
--- a/keras_nlp/models/f_net/f_net_preprocessor.py
+++ b/keras_nlp/models/f_net/f_net_preprocessor.py
@@ -69,10 +69,8 @@ class FNetPreprocessor(Preprocessor):
 
     Directly calling the from_preset().
     ```python
-    tokenizer = keras_nlp.models.FNetTokenizer(proto="model.spm")
-    preprocessor = keras_nlp.models.FNetPreprocessor(
-        tokenizer=tokenizer,
-        sequence_length=10,
+    preprocessor = keras_nlp.models.FNetPreprocessor.from_preset(
+        "f_net_base_en"
     )
 
     # Tokenize and pack a single sentence.
@@ -86,10 +84,6 @@ class FNetPreprocessor(Preprocessor):
     first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
     second = tf.constant(["The fox tripped.", "Oh look, a whale."])
     preprocessor((first, second))
-
-
-    preprocessor = keras_nlp.models.FNetPreprocessor(tokenizer)
-    preprocessor("The quick brown fox jumped.")
     ```
 
     Mapping with `tf.data.Dataset`.
diff --git a/keras_nlp/models/preprocessor.py b/keras_nlp/models/preprocessor.py
index 70b2e72b70..bc70817ed7 100644
--- a/keras_nlp/models/preprocessor.py
+++ b/keras_nlp/models/preprocessor.py
@@ -67,18 +67,10 @@ def from_preset(
         Examples:
 
         ```python
-        # Load preprocessor from preset
+        # Load a preprocessor layer from a preset.
         preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
             "{{example_preset_name}}",
         )
-        preprocessor("The quick brown fox jumped.")
-
-        # Override sequence_length
-        preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
-            "{{example_preset_name}}",
-            sequence_length=64
-        )
-        preprocessor("The quick brown fox jumped.")
         ```
         """
         if not cls.presets:
diff --git a/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py b/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py
index 6a7b76d257..0ea8df5a36 100644
--- a/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py
+++ b/keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py
@@ -75,6 +75,8 @@ class RobertaMaskedLMPreprocessor(RobertaPreprocessor):
         generates label weights.
 
     Examples:
+
+    Directly calling the layer on data.
     ```python
     # Load the preprocessor from a preset.
     preprocessor = keras_nlp.models.RobertaMaskedLMPreprocessor.from_preset(
diff --git a/keras_nlp/models/t5/t5_tokenizer.py b/keras_nlp/models/t5/t5_tokenizer.py
index d6c058c577..5eb2437b5d 100644
--- a/keras_nlp/models/t5/t5_tokenizer.py
+++ b/keras_nlp/models/t5/t5_tokenizer.py
@@ -43,7 +43,25 @@ class T5Tokenizer(SentencePieceTokenizer):
     Examples:
     ```python
-    tokenizer = keras_nlp.models.T5Tokenizer(proto="model.spm")
+    bytes_io = io.BytesIO()
+    ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
+    sentencepiece.SentencePieceTrainer.train(
+        sentence_iterator=ds.as_numpy_iterator(),
+        model_writer=bytes_io,
+        vocab_size=8,
+        model_type="WORD",
+        bos_id=-1,
+        pad_id=0,
+        eos_id=1,
+        unk_id=2,
+        pad_piece="<pad>",
+        eos_piece="</s>",
+        unk_piece="<unk>",
+    )
+    tokenizer = keras_nlp.models.T5Tokenizer(
+        proto=bytes_io.getvalue(),
+    )
+
     tokenizer("The quick brown fox jumped.")
 
     # Batched inputs.
     tokenizer(["the quick brown fox", "the earth is round"])
@@ -52,7 +70,7 @@ class T5Tokenizer(SentencePieceTokenizer):
     tokenizer("the quick brown fox")
 
     # Detokenization.
-    tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]]))
+    tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
     ```
     """
 
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py
index 2ed7baa0bf..81d94f8de3 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py
@@ -77,6 +77,8 @@ class XLMRobertaMaskedLMPreprocessor(XLMRobertaPreprocessor):
         generates label weights.
 
     Examples:
+
+    Directly calling the layer on data.
     ```python
     # Load the preprocessor from a preset.
     preprocessor = keras_nlp.models.XLMRobertaMaskedLMPreprocessor.from_preset(
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
index 6ecb5016e7..122c372c01 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
@@ -74,6 +74,8 @@ class XLMRobertaPreprocessor(Preprocessor):
         sample_weight: Any label weight data. Will be passed through unaltered.
 
     Examples:
+
+    Directly calling the layer on data.
     ```python
     preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
         "xlm_roberta_base_multi"
diff --git a/keras_nlp/samplers/beam_sampler.py b/keras_nlp/samplers/beam_sampler.py
index 2830640d7e..a3970223b1 100644
--- a/keras_nlp/samplers/beam_sampler.py
+++ b/keras_nlp/samplers/beam_sampler.py
@@ -73,22 +73,24 @@ def next(prompt, cache, index):
     char_lookup = {v: k for k, v in int_lookup.items()}
     batch_size, length, vocab_size = 1, 8, len(int_lookup)
 
-    def next(prompt, state, index):
+    def next(prompt, cache, index):
+        prompt_batch_size = tf.shape(prompt)[0]
+        hidden_states = tf.ones((prompt_batch_size, 10))
         # A uniform distribution over our alphabet.
         logits = tf.ones((batch_size, vocab_size))
-        return logits, state
+        return logits, hidden_states, cache
 
-    output = keras_nlp.samplers.BeamSampler(return_all_beams=True)(
+    beams, probs = keras_nlp.samplers.BeamSampler(return_all_beams=True)(
         next=next,
         prompt=tf.fill((batch_size, length,), char_lookup['z']),
         index=5,
     )
 
-    print(output[0].shape)
+    print(beams.shape)
     # >>> (1, 5, 8)
-    print(output[1].shape)
+    print(probs.shape)
     # >>> (1, 5)
-    print(["".join([int_lookup[i] for i in s]) for s in output.numpy()])
+    print(["".join([int_lookup[i] for i in s]) for s in beams[0].numpy()])
     # >>> ['zzzzzeee', 'zzzzzeed', 'zzzzzeec', 'zzzzzeea', 'zzzzzeeb']
     ```
     """
diff --git a/keras_nlp/tests/doc_tests/docstring_test.py b/keras_nlp/tests/doc_tests/docstring_test.py
index 55f5c2053d..b3129f416e 100644
--- a/keras_nlp/tests/doc_tests/docstring_test.py
+++ b/keras_nlp/tests/doc_tests/docstring_test.py
@@ -52,10 +52,6 @@ def test_docstrings():
     runner = unittest.TextTestRunner()
     suite = unittest.TestSuite()
     for module in keras_nlp_modules:
-        # Temporarily stop testing gpt2 & deberta docstrings until we are
-        # exporting the symbols.
-        if "gpt2" in module.__name__ or "deberta_v3" in module.__name__:
-            continue
         suite.addTest(
             doctest.DocTestSuite(
                 module,
@@ -98,23 +94,6 @@ def test_fenced_docstrings():
     runner = unittest.TextTestRunner()
     suite = unittest.TestSuite()
     for module in keras_nlp_modules:
-        # Do not test certain modules.
-        if module.__name__ in [
-            # Base classes.
-            "keras_nlp.models.backbone",
-            "keras_nlp.models.preprocessor",
-            "keras_nlp.models.task",
-            "keras_nlp.tokenizers.byte_pair_tokenizer",
-            "keras_nlp.tokenizers.sentence_piece_tokenizer",
-            "keras_nlp.tokenizers.word_piece_tokenizer",
-            # Preprocessors and tokenizers which use `model.spm` (temporary).
-            "keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor",
-            "keras_nlp.models.f_net.f_net_preprocessor",
-            "keras_nlp.models.f_net.f_net_tokenizer",
-            "keras_nlp.models.t5.t5_tokenizer",
-        ]:
-            continue
-
         suite.addTest(
             doctest.DocTestSuite(
                 module,
diff --git a/keras_nlp/tests/doc_tests/fenced_docstring_lib.py b/keras_nlp/tests/doc_tests/fenced_docstring_lib.py
index d1911d848d..38819429d5 100644
--- a/keras_nlp/tests/doc_tests/fenced_docstring_lib.py
+++ b/keras_nlp/tests/doc_tests/fenced_docstring_lib.py
@@ -91,6 +91,11 @@ def get_examples(
         if re.search("doctest.*skip", match.group(0), re.IGNORECASE):
             continue
 
+        # Do not test any docstring with our format string markers.
+        # These will not run until formatted.
+        if re.search("{{", match.group(0)):
+            continue
+
         groups = match.groupdict()
         source = textwrap.dedent(groups["doctest"])
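Note (not part of the patch): the guard added to `fenced_docstring_lib.py` exists because base-class docstrings, such as the `Preprocessor.from_preset` example above, still contain `{{...}}` template markers and only become runnable once formatted on a concrete subclass. The snippet below is a rough standalone sketch of that check; the `templated_example` string and the printed messages are illustrative placeholders, not code from the repo.

```python
import re

# A trimmed copy of the templated example from keras_nlp/models/preprocessor.py.
# The {{...}} markers are only substituted on concrete subclasses, so this text
# is not runnable Python as written.
templated_example = """
preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
    "{{example_preset_name}}",
)
"""

# Same idea as the check added to get_examples(): skip any fenced example that
# still carries unformatted template markers instead of collecting it as a test.
if re.search("{{", templated_example):
    print("skipping unformatted docstring example")
else:
    print("collecting example for doctest execution")
```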