3 changes: 3 additions & 0 deletions keras_nlp/models/distil_bert/distil_bert_tokenizer.py
@@ -56,10 +56,13 @@ class DistilBertTokenizer(WordPieceTokenizer):
"distil_bert_base_en_uncased",
)
tokenizer("The quick brown fox jumped.")

# Batched input.
tokenizer(["The quick brown fox jumped.", "The fox slept."])

# Detokenization.
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))

# Custom vocabulary.
vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
vocab += ["The", "quick", "brown", "fox", "jumped", "."]
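The custom-vocabulary example is cut off by the hunk above; a minimal sketch of how it typically continues, assuming the tokenizer's standard `vocabulary` argument (not part of this diff):

```python
# Hypothetical continuation of the truncated custom-vocabulary example.
tokenizer = keras_nlp.models.DistilBertTokenizer(vocabulary=vocab)
tokenizer("The quick brown fox jumped.")
```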
10 changes: 2 additions & 8 deletions keras_nlp/models/f_net/f_net_preprocessor.py
@@ -69,10 +69,8 @@ class FNetPreprocessor(Preprocessor):

Directly calling `from_preset()`.
```python
tokenizer = keras_nlp.models.FNetTokenizer(proto="model.spm")
preprocessor = keras_nlp.models.FNetPreprocessor(
tokenizer=tokenizer,
sequence_length=10,
preprocessor = keras_nlp.models.FNetPreprocessor.from_preset(
"f_net_base_en"
)

# Tokenize and pack a single sentence.
@@ -86,10 +84,6 @@ class FNetPreprocessor(Preprocessor):
first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])
preprocessor((first, second))


preprocessor = keras_nlp.models.FNetPreprocessor(tokenizer)
preprocessor("The quick brown fox jumped.")
```

Mapping with `tf.data.Dataset`.
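The `tf.data.Dataset` example is cut off by the hunk above; a minimal sketch of the usual mapping pattern, assuming the `f_net_base_en` preset shown earlier (not part of this diff):

```python
preprocessor = keras_nlp.models.FNetPreprocessor.from_preset("f_net_base_en")
ds = tf.data.Dataset.from_tensor_slices(
    ["The quick brown fox jumped.", "Call me Ishmael."]
)
# Map the preprocessor over the dataset; AUTOTUNE parallelizes the calls.
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
```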
10 changes: 1 addition & 9 deletions keras_nlp/models/preprocessor.py
@@ -67,18 +67,10 @@ def from_preset(

Examples:
```python
# Load preprocessor from preset
# Load a preprocessor layer from a preset.
preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
"{{example_preset_name}}",
)
preprocessor("The quick brown fox jumped.")

# Override sequence_length
preprocessor = keras_nlp.models.{{preprocessor_name}}.from_preset(
"{{example_preset_name}}",
sequence_length=64
)
preprocessor("The quick brown fox jumped.")
```
"""
if not cls.presets:
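This docstring is a template: the `{{...}}` markers are filled in per model before the docs are rendered, which is also why the fenced-docstring test below now skips anything still containing `{{`. For one concrete model the rendered example would read roughly as follows (`BertPreprocessor` and `bert_base_en_uncased` are assumed stand-ins, not part of this diff):

```python
# Rendered form of the template above for an assumed concrete model.
preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
    "bert_base_en_uncased",
)
preprocessor("The quick brown fox jumped.")
```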
2 changes: 2 additions & 0 deletions keras_nlp/models/roberta/roberta_masked_lm_preprocessor.py
@@ -75,6 +75,8 @@ class RobertaMaskedLMPreprocessor(RobertaPreprocessor):
generates label weights.

Examples:

Directly calling the layer on data.
```python
# Load the preprocessor from a preset.
preprocessor = keras_nlp.models.RobertaMaskedLMPreprocessor.from_preset(
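# Note: the hunk above is truncated mid-call; a minimal sketch of the full
# "directly calling the layer on data" pattern it introduces follows.
# "roberta_base_en" is an assumed preset name, not part of this diff.
preprocessor = keras_nlp.models.RobertaMaskedLMPreprocessor.from_preset(
    "roberta_base_en"
)
# Calling the preprocessor returns (x, y, sample_weight): x carries token
# ids with randomly selected positions masked out plus those positions,
# y carries the original ids at the masked positions, and sample_weight
# marks which label positions contribute to the loss.
x, y, sw = preprocessor("The quick brown fox jumped.")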
22 changes: 20 additions & 2 deletions keras_nlp/models/t5/t5_tokenizer.py
@@ -43,7 +43,25 @@ class T5Tokenizer(SentencePieceTokenizer):
Examples:

```python
tokenizer = keras_nlp.models.T5Tokenizer(proto="model.spm")
bytes_io = io.BytesIO()
ds = tf.data.Dataset.from_tensor_slices(["The quick brown fox jumped."])
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=ds.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=8,
model_type="WORD",
bos_id=-1,
pad_id=0,
eos_id=1,
unk_id=2,
pad_piece="<pad>",
eos_piece="</s>",
unk_piece="<unk>",
)
tokenizer = keras_nlp.models.T5Tokenizer(
proto=bytes_io.getvalue(),
)
tokenizer("The quick brown fox jumped.")

# Batched inputs.
tokenizer(["the quick brown fox", "the earth is round"])
@@ -52,7 +70,7 @@ class T5Tokenizer(SentencePieceTokenizer):
tokenizer("the quick brown fox")

# Detokenization.
tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]]))
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
```
"""

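The new T5 example trains a toy SentencePiece model inline, so it assumes a few modules are importable in the doctest namespace. A minimal set, inferred from the calls it makes (not part of the diff):

```python
import io

import sentencepiece
import tensorflow as tf

import keras_nlp
```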
2 changes: 2 additions & 0 deletions keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm_preprocessor.py
@@ -77,6 +77,8 @@ class XLMRobertaMaskedLMPreprocessor(XLMRobertaPreprocessor):
generates label weights.

Examples:

Directly calling the layer on data.
```python
# Load the preprocessor from a preset.
preprocessor = keras_nlp.models.XLMRobertaMaskedLMPreprocessor.from_preset(
2 changes: 2 additions & 0 deletions keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
@@ -74,6 +74,8 @@ class XLMRobertaPreprocessor(Preprocessor):
sample_weight: Any label weight data. Will be passed through unaltered.

Examples:

Directly calling the layer on data.
```python
preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
"xlm_roberta_base_multi"
14 changes: 8 additions & 6 deletions keras_nlp/samplers/beam_sampler.py
@@ -73,22 +73,24 @@ def next(prompt, cache, index):
char_lookup = {v: k for k, v in int_lookup.items()}
batch_size, length, vocab_size = 1, 8, len(int_lookup)

def next(prompt, state, index):
def next(prompt, cache, index):
prompt_batch_size = tf.shape(prompt)[0]
hidden_states = tf.ones((prompt_batch_size, 10))
# A uniform distribution over our alphabet.
logits = tf.ones((batch_size, vocab_size))
return logits, state
return logits, hidden_states, cache

output = keras_nlp.samplers.BeamSampler(return_all_beams=True)(
beams, probs = keras_nlp.samplers.BeamSampler(return_all_beams=True)(
next=next,
prompt=tf.fill((batch_size, length,), char_lookup['z']),
index=5,
)

print(output[0].shape)
print(beams.shape)
# >>> (1, 5, 8)
print(output[1].shape)
print(probs.shape)
# >>> (1, 5)
print(["".join([int_lookup[i] for i in s]) for s in output.numpy()])
print(["".join([int_lookup[i] for i in s]) for s in beams[0].numpy()])
# >>> ['zzzzzeee', 'zzzzzeed', 'zzzzzeec', 'zzzzzeea', 'zzzzzeeb']
```
"""
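For contrast with the `return_all_beams=True` example fixed above, the default sampler call returns only the top beam per batch entry. A rough sketch under that assumption, reusing the `next`, `char_lookup`, and size definitions from the example (not part of the diff):

```python
# Default behavior (return_all_beams=False, assumed): only the best
# sequence per batch entry is returned, with shape (batch_size, length).
output = keras_nlp.samplers.BeamSampler()(
    next=next,
    prompt=tf.fill((batch_size, length,), char_lookup['z']),
    index=5,
)
print(["".join([int_lookup[i] for i in s]) for s in output.numpy()])
# >>> e.g. ['zzzzzeee'] (one decoded string per batch entry)
```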
21 changes: 0 additions & 21 deletions keras_nlp/tests/doc_tests/docstring_test.py
@@ -52,10 +52,6 @@ def test_docstrings():
runner = unittest.TextTestRunner()
suite = unittest.TestSuite()
for module in keras_nlp_modules:
# Temporarily stop testing gpt2 & deberta docstrings until we are
# exporting the symbols.
if "gpt2" in module.__name__ or "deberta_v3" in module.__name__:
continue
suite.addTest(
doctest.DocTestSuite(
module,
@@ -98,23 +94,6 @@ def test_fenced_docstrings():
runner = unittest.TextTestRunner()
suite = unittest.TestSuite()
for module in keras_nlp_modules:
# Do not test certain modules.
if module.__name__ in [
# Base classes.
"keras_nlp.models.backbone",
"keras_nlp.models.preprocessor",
"keras_nlp.models.task",
"keras_nlp.tokenizers.byte_pair_tokenizer",
"keras_nlp.tokenizers.sentence_piece_tokenizer",
"keras_nlp.tokenizers.word_piece_tokenizer",
# Preprocessors and tokenizers which use `model.spm` (temporary).
"keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor",
"keras_nlp.models.f_net.f_net_preprocessor",
"keras_nlp.models.f_net.f_net_tokenizer",
"keras_nlp.models.t5.t5_tokenizer",
]:
continue

suite.addTest(
doctest.DocTestSuite(
module,
5 changes: 5 additions & 0 deletions keras_nlp/tests/doc_tests/fenced_docstring_lib.py
@@ -91,6 +91,11 @@ def get_examples(
if re.search("doctest.*skip", match.group(0), re.IGNORECASE):
continue

# Do not test any docstring with our format string markers.
# These will not run until formatted.
if re.search("{{", match.group(0)):
continue

groups = match.groupdict()

source = textwrap.dedent(groups["doctest"])
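A small illustration of what the new guard catches: any example still carrying unformatted `{{...}}` template markers, such as the `Preprocessor.from_preset` docstring touched earlier in this PR (the fragment below is only illustrative):

```python
import re

# A docstring fragment that still contains template markers.
example = 'keras_nlp.models.{{preprocessor_name}}.from_preset("{{example_preset_name}}")'

# The added check skips any example containing "{{", since it cannot
# run until the markers have been formatted in.
print(bool(re.search("{{", example)))  # True -> example is skipped
```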