Merged

Changes from 9 commits
2 changes: 1 addition & 1 deletion keras_nlp/models/albert/albert_tokenizer.py
@@ -59,7 +59,7 @@ class AlbertTokenizer(SentencePieceTokenizer):
tokenizer("the quick brown fox")

# Detokenization.
-tokenizer.detokenize(tf.constant([[[2, 14, 2231, 886, 2385, 3]]))
+tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]]))
```
"""

2 changes: 1 addition & 1 deletion keras_nlp/models/bert/bert_backbone.py
@@ -77,7 +77,7 @@ class BertBackbone(Backbone):
}

# Pretrained BERT encoder
model = keras_nlp.models.BertBackbone.from_preset("base_base_en_uncased")
model = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")
Member: lol, good we are running this :)

output = model(input_data)

# Randomly initialized BERT encoder with a custom config
7 changes: 5 additions & 2 deletions keras_nlp/models/distil_bert/distil_bert_classifier.py
@@ -64,7 +64,7 @@ class DistilBertClassifier(PipelineModel):
preprocessed_features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"padding_mask": tf.constant(
-    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(1, 12)),
+    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
}
labels = [0, 3]

@@ -84,6 +84,9 @@ class DistilBertClassifier(PipelineModel):
num_classes=4,
preprocessor=None,
)
+classifier.compile(
+    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)

# Access backbone programmatically (e.g., to change `trainable`)
@@ -218,7 +221,7 @@ def from_preset(
labels = [0, 3]

# Use a shorter sequence length.
-preprocessor = keras_nlp.models.DistilBertBackbone.from_preset(
+preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
"distil_bert_base_en_uncased",
sequence_length=128,
)
12 changes: 6 additions & 6 deletions keras_nlp/models/roberta/roberta_classifier.py
@@ -197,8 +197,8 @@ def from_preset(
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

-# Create a RobertClassifier and fit your data.
-classifier = keras_nlp.models.RobertClassifier.from_preset(
+# Create a RobertaClassifier and fit your data.
+classifier = keras_nlp.models.RobertaClassifier.from_preset(
"roberta_base_en",
num_classes=4,
)
@@ -215,13 +215,13 @@ def from_preset(
labels = [0, 3]

# Use a shorter sequence length.
-preprocessor = keras_nlp.models.RobertPreprocessor.from_preset(
+preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(
"roberta_base_en",
sequence_length=128,
)

-# Create a RobertClassifier and fit your data.
-classifier = keras_nlp.models.RobertClassifier.from_preset(
+# Create a RobertaClassifier and fit your data.
+classifier = keras_nlp.models.RobertaClassifier.from_preset(
"roberta_base_en",
num_classes=4,
preprocessor=preprocessor,
@@ -244,7 +244,7 @@ def from_preset(
labels = [0, 3]

# Create a RoBERTa classifier and fit your data.
-classifier = keras_nlp.models.RobertClassifier.from_preset(
+classifier = keras_nlp.models.RobertaClassifier.from_preset(
"roberta_base_en",
num_classes=4,
preprocessor=None,
4 changes: 2 additions & 2 deletions keras_nlp/models/roberta/roberta_preprocessor.py
@@ -242,13 +242,13 @@ def from_preset(
Examples:
```python
# Load preprocessor from preset
-preprocessor = keras_nlp.models.RobertPreprocessor.from_preset(
+preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(
"roberta_base_en",
)
preprocessor("The quick brown fox jumped.")

# Override sequence_length
-preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
+preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(
"roberta_base_en",
sequence_length=64
)
5 changes: 4 additions & 1 deletion keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
@@ -62,7 +62,7 @@ class XLMRobertaClassifier(PipelineModel):
preprocessed_features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"padding_mask": tf.constant(
-    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(1, 12)),
+    [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
}
labels = [0, 3]

@@ -82,6 +82,9 @@ class XLMRobertaClassifier(PipelineModel):
num_classes=4,
preprocessor=None,
)
+classifier.compile(
+    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)

# Access backbone programmatically (e.g., to change `trainable`)
2 changes: 1 addition & 1 deletion keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -186,7 +186,7 @@ def from_preset(
tokenizer("The quick brown fox tripped.")

# Detokenize some input.
-tokenizer.detokenize([5, 6, 7, 8, 9])
+tokenizer.detokenize(tf.constant([581, 63773, 119455, 6, 147797]))
```
"""
if preset not in cls.presets:
73 changes: 73 additions & 0 deletions keras_nlp/tests/doc_tests/docstring_test.py
@@ -13,19 +13,24 @@
# limitations under the License.

import doctest
import io
import os
import re
import sys
import unittest

import numpy as np
import pytest
import sentencepiece
import tensorflow as tf
from tensorflow import keras

import keras_nlp
from keras_nlp.tests.doc_tests import docstring_lib
from keras_nlp.tests.doc_tests import fenced_docstring_lib

PACKAGE = "keras_nlp."
DIRECTORY = "keras_nlp"


def find_modules():
@@ -37,6 +42,18 @@ def find_modules():
return keras_nlp_modules


def find_files(regex_pattern=None):
py_files = []
for root, dirs, files in os.walk(DIRECTORY):
for file in files:
file = os.path.join(root, file)
if file.endswith(".py"):
if regex_pattern is not None and regex_pattern.search(file):
continue
py_files.append(file)
return py_files


@pytest.mark.skipif(
sys.platform == "win32", reason="Numpy prints differently on windows"
)
@@ -77,3 +94,59 @@ def test_docstrings():
if not result.wasSuccessful():
print(result)
assert result.wasSuccessful()


@pytest.mark.skipif(
Member: we can probably remove this case everywhere, we aren't supporting win32 natively anymore (only through WSL).

sys.platform == "win32", reason="Numpy prints differently on windows"
)
def test_fenced_docstrings():
Member: we should probably mark this a large test, this will involve a lot of file downloads for the preset, right?
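For illustration, a minimal sketch of that suggestion, assuming a `large` marker is registered in the project's pytest configuration (an assumption here, not part of this diff):

```python
# Hypothetical: gate the slow, download-heavy test behind a `large` marker,
# so it is only collected when large tests are explicitly enabled.
import pytest

@pytest.mark.large
def test_fenced_docstrings():
    ...
```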


regex_pattern = re.compile(
Member: Is this a pattern to skip? If so we should probably name/comment this to make it clearer.

I wonder if there is an annotation we could add that would make this more flexible (and work for both types of docstrings). @do_not_test_docstring or something like that.
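A rough sketch of what such an annotation could look like; the decorator name and attribute follow the reviewer's suggestion and are hypothetical, not an existing keras_nlp API:

```python
# Hypothetical @do_not_test_docstring marker: sets a flag that the docstring
# test collectors could check, instead of maintaining a path-skip regex.
def do_not_test_docstring(obj):
    obj._skip_docstring_test = True
    return obj

# Collection would then filter on the flag, e.g.:
# candidates = [m for m in candidates
#               if not getattr(m, "_skip_docstring_test", False)]
```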

r"|".join(
[
# Endswith patterns
"test\\.py$",
"__init__\\.py$",
# Whole string matching
"^keras_nlp/models/backbone\\.py$",
"^keras_nlp/models/preprocessor\\.py$",
"^keras_nlp/models/task\\.py$",
# Unexported symbols
"deberta_v3",
"gpt2",
]
)
)
keras_nlp_files = find_files(regex_pattern=regex_pattern)
runner = unittest.TextTestRunner()
suite = unittest.TestSuite()

suite.addTest(
doctest.DocFileSuite(
*keras_nlp_files,
Member: I think you said you were going to look into doing this via modules and not files, is that still possible? What was the outcome of checking this out?
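For reference, a minimal sketch of the module-based alternative using the standard library, reusing this file's imports and helpers; wiring the fenced parser into `DocTestFinder` is an assumption, not something verified in this PR:

```python
# doctest.DocTestSuite collects examples from an imported module's docstrings,
# whereas DocFileSuite (used above) parses source files as text.
finder = doctest.DocTestFinder(
    parser=fenced_docstring_lib.FencedCellParser(fence_label="python")
)
for module in find_modules():
    suite.addTest(doctest.DocTestSuite(module, test_finder=finder))
```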

module_relative=False,
parser=fenced_docstring_lib.FencedCellParser(fence_label="python"),
globs={
"_print_if_not_none": fenced_docstring_lib._print_if_not_none,
"tf": tf,
"np": np,
"os": os,
"keras": keras,
"keras_nlp": keras_nlp,
"io": io,
"sentencepiece": sentencepiece,
},
checker=fenced_docstring_lib.FencedCellOutputChecker(),
optionflags=(
doctest.ELLIPSIS
| doctest.NORMALIZE_WHITESPACE
| doctest.IGNORE_EXCEPTION_DETAIL
| doctest.DONT_ACCEPT_BLANKLINE
),
)
)

result = runner.run(suite)
if not result.wasSuccessful():
print(result)
assert result.wasSuccessful()