
Commit 544aca0

Add Fenced Docstring Testing (#640)
* Add fenced docstring lib
* Fixes
* Use RegEx
* Some UT fixes
* More docstring fixes
* Fix find_files()
* Fix more docstrings
* Fix classifier docstrings
* Fix mistake
* Module-level test collection
* Stop testing Backbone
* Fixes
* Add source
* Conditional import for astor
* Small fixes
* Minor edit
* Fixes
* Add FNet preprocessor to skipped fenced ds
* Remove unused vars
* copy edit

Co-authored-by: Matt Watson <[email protected]>
1 parent 665174f commit 544aca0
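For context, the mechanism this commit adds: a doctest parser (`FencedCellParser` in the new `fenced_docstring_lib`) pulls fenced ```python blocks out of docstrings and runs them as tests, so docstring examples must actually execute. A minimal sketch of the extraction idea, going only off the commit message's "Use RegEx" note (the regex and helper name here are illustrative, not the commit's code):

```python
import re

# Illustrative only: grab the body of every ```python ... ``` block.
FENCE_RE = re.compile(r"```python\n(.*?)```", re.DOTALL)

def extract_fenced_examples(docstring):
    """Return the code bodies of all python-fenced blocks in a docstring."""
    return FENCE_RE.findall(docstring or "")
```

Each extracted body is then executed the way the updated `docstring_test.py` below does it: wrapped into a `doctest.DocTestSuite` with shared globals such as `tf`, `np`, and `keras_nlp`. That is why most of the diffs below are small docstring bug fixes.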


13 files changed: +303 −24 lines


keras_nlp/models/albert/albert_tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ class AlbertTokenizer(SentencePieceTokenizer):
     tokenizer("the quick brown fox")
 
     # Detokenization.
-    tokenizer.detokenize(tf.constant([[[2, 14, 2231, 886, 2385, 3]]))
+    tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]]))
     ```
     """
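Worth noting: the removed line opens three brackets but closes only two, so it was a syntax error, not just a wrong shape; this is exactly the class of bug the new fenced tests surface. A quick illustrative check (not part of the commit):

```python
import ast

bad = "tokenizer.detokenize(tf.constant([[[2, 14, 2231, 886, 2385, 3]]))"
try:
    ast.parse(bad)  # the unbalanced brackets fail to parse
except SyntaxError as err:
    print("caught:", err.msg)
```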

keras_nlp/models/backbone.py

Lines changed: 4 additions & 2 deletions
@@ -51,10 +51,12 @@ def from_preset(
     Examples:
     ```python
     # Load architecture and weights from preset
-    model = {{model_name}}.from_preset("{{example_preset_name}}")
+    model = keras_nlp.models.{{model_name}}.from_preset(
+        "{{example_preset_name}}"
+    )
 
     # Load randomly initialized model from preset architecture
-    model = {{model_name}}.from_preset(
+    model = keras_nlp.models.{{model_name}}.from_preset(
         "{{example_preset_name}}",
         load_weights=False
     )
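Since `{{model_name}}` and `{{example_preset_name}}` are template slots filled in per subclass, the rendered docstring for, say, BERT (using the preset name from the `bert_backbone.py` fix below) would read:

```python
import keras_nlp

# Load architecture and weights from preset.
model = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")

# Load a randomly initialized model from the preset architecture.
model = keras_nlp.models.BertBackbone.from_preset(
    "bert_base_en_uncased",
    load_weights=False,
)
```

The fully qualified `keras_nlp.models.` prefix matters here: the fenced tests execute examples with only `keras_nlp` (plus `tf`, `np`, etc.) in scope, so a bare `{{model_name}}` would be an undefined name.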

keras_nlp/models/bert/bert_backbone.py

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ class BertBackbone(Backbone):
     }
 
     # Pretrained BERT encoder
-    model = keras_nlp.models.BertBackbone.from_preset("base_base_en_uncased")
+    model = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")
     output = model(input_data)
 
     # Randomly initialized BERT encoder with a custom config

keras_nlp/models/distil_bert/distil_bert_classifier.py

Lines changed: 5 additions & 2 deletions
@@ -64,7 +64,7 @@ class DistilBertClassifier(PipelineModel):
     preprocessed_features = {
         "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
         "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(1, 12)),
+            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
     }
     labels = [0, 3]

@@ -84,6 +84,9 @@ class DistilBertClassifier(PipelineModel):
         num_classes=4,
         preprocessor=None,
     )
+    classifier.compile(
+        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    )
     classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
 
     # Access backbone programatically (e.g., to change `trainable`)

@@ -218,7 +221,7 @@ def from_preset(
     labels = [0, 3]
 
     # Use a shorter sequence length.
-    preprocessor = keras_nlp.models.DistilBertBackbone.from_preset(
+    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
         "distil_bert_base_en_uncased",
         sequence_length=128,
     )
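Two of these fixes are about making the example actually run: the mask is two rows, so the old `shape=(1, 12)` contradicted the data, and the example now compiles with a `from_logits=True` loss before calling `fit`. A quick illustrative check of the shape fix (not part of the commit):

```python
import tensorflow as tf

rows = [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2  # 24 elements total
mask = tf.constant(rows, shape=(2, 12))  # OK: shape matches the data
# tf.constant(rows, shape=(1, 12)) would raise: too many elements for shape.
```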

keras_nlp/models/f_net/f_net_tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ class FNetTokenizer(SentencePieceTokenizer):
     tokenizer("the quick brown fox")
 
     # Detokenization.
-    tokenizer.detokenize(tf.constant([[[2, 14, 2231, 886, 2385, 3]]))
+    tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]]))
     ```
     """

keras_nlp/models/roberta/roberta_classifier.py

Lines changed: 6 additions & 6 deletions
@@ -197,8 +197,8 @@ def from_preset(
     features = ["The quick brown fox jumped.", "I forgot my homework."]
     labels = [0, 3]
 
-    # Create a RobertClassifier and fit your data.
-    classifier = keras_nlp.models.RobertClassifier.from_preset(
+    # Create a RobertaClassifier and fit your data.
+    classifier = keras_nlp.models.RobertaClassifier.from_preset(
         "roberta_base_en",
         num_classes=4,
     )

@@ -215,13 +215,13 @@ def from_preset(
     labels = [0, 3]
 
     # Use a shorter sequence length.
-    preprocessor = keras_nlp.models.RobertPreprocessor.from_preset(
+    preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(
         "roberta_base_en",
         sequence_length=128,
     )
 
-    # Create a RobertClassifier and fit your data.
-    classifier = keras_nlp.models.RobertClassifier.from_preset(
+    # Create a RobertaClassifier and fit your data.
+    classifier = keras_nlp.models.RobertaClassifier.from_preset(
         "roberta_base_en",
         num_classes=4,
         preprocessor=preprocessor,

@@ -244,7 +244,7 @@ def from_preset(
     labels = [0, 3]
 
     # Create a RoBERTa classifier and fit your data.
-    classifier = keras_nlp.models.RobertClassifier.from_preset(
+    classifier = keras_nlp.models.RobertaClassifier.from_preset(
         "roberta_base_en",
         num_classes=4,
         preprocessor=None,

keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py

Lines changed: 4 additions & 1 deletion
@@ -62,7 +62,7 @@ class XLMRobertaClassifier(PipelineModel):
     preprocessed_features = {
         "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
         "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(1, 12)),
+            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
     }
     labels = [0, 3]

@@ -82,6 +82,9 @@ class XLMRobertaClassifier(PipelineModel):
         num_classes=4,
         preprocessor=None,
     )
+    classifier.compile(
+        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    )
     classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
 
     # Access backbone programatically (e.g., to change `trainable`)

keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ def from_preset(
     tokenizer("The quick brown fox tripped.")
 
     # Detokenize some input.
-    tokenizer.detokenize([5, 6, 7, 8, 9])
+    tokenizer.detokenize(tf.constant([581, 63773, 119455, 6, 147797]))
     ```
     """
     if preset not in cls.presets:

keras_nlp/tests/doc_tests/docstring_test.py

Lines changed: 76 additions & 3 deletions
@@ -13,17 +13,23 @@
 # limitations under the License.
 
 import doctest
+import io
 import os
 import sys
 import unittest
 
 import numpy as np
 import pytest
+import sentencepiece
 import tensorflow as tf
 from tensorflow import keras
 
 import keras_nlp
 from keras_nlp.tests.doc_tests import docstring_lib
+from keras_nlp.tests.doc_tests import fenced_docstring_lib
+from keras_nlp.tests.doc_tests.fenced_docstring_lib import (
+    astor,  # For checking conditional import.
+)
 
 PACKAGE = "keras_nlp."

@@ -37,9 +43,6 @@ def find_modules():
     return keras_nlp_modules
 
 
-@pytest.mark.skipif(
-    sys.platform == "win32", reason="Numpy prints differently on windows"
-)
 def test_docstrings():
     keras_nlp_modules = find_modules()
     # As of this writing, it doesn't seem like pytest support load_tests

@@ -77,3 +80,73 @@ def test_docstrings():
     if not result.wasSuccessful():
         print(result)
     assert result.wasSuccessful()
+
+
+@pytest.mark.extra_large
+@pytest.mark.skipif(
+    astor is None,
+    reason="This test requires `astor`. Please `pip install astor` to run.",
+)
+def test_fenced_docstrings():
+    """Tests fenced code blocks in docstrings.
+
+    This can only be run manually. Run with:
+    `pytest keras_nlp/tests/doc_tests/docstring_test.py --run_extra_large`
+    """
+    keras_nlp_modules = find_modules()
+
+    runner = unittest.TextTestRunner()
+    suite = unittest.TestSuite()
+    for module in keras_nlp_modules:
+        # Temporarily stop testing gpt2 & deberta docstrings until we are
+        # exporting the symbols.
+        if "gpt2" in module.__name__ or "deberta_v3" in module.__name__:
+            continue
+        # Do not test certain modules.
+        if module.__name__ in [
+            # Base classes.
+            "keras_nlp.models.backbone",
+            "keras_nlp.models.preprocessor",
+            # Preprocessors and tokenizers which use `model.spm`.
+            "keras_nlp.models.albert.albert_preprocessor",
+            "keras_nlp.models.albert.albert_tokenizer",
+            "keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor",
+            "keras_nlp.models.f_net.f_net_preprocessor",
+            "keras_nlp.models.f_net.f_net_tokenizer",
+        ]:
+            continue
+
+        suite.addTest(
+            doctest.DocTestSuite(
+                module,
+                test_finder=doctest.DocTestFinder(
+                    exclude_empty=False,
+                    parser=fenced_docstring_lib.FencedCellParser(
+                        fence_label="python"
+                    ),
+                ),
+                globs={
+                    "_print_if_not_none": fenced_docstring_lib._print_if_not_none
+                },
+                extraglobs={
+                    "tf": tf,
+                    "np": np,
+                    "os": os,
+                    "keras": keras,
+                    "keras_nlp": keras_nlp,
+                    "io": io,
+                    "sentencepiece": sentencepiece,
+                },
+                checker=docstring_lib.DoctestOutputChecker(),
+                optionflags=(
+                    doctest.ELLIPSIS
+                    | doctest.NORMALIZE_WHITESPACE
+                    | doctest.IGNORE_EXCEPTION_DETAIL
+                    | doctest.DONT_ACCEPT_BLANKLINE
+                ),
+            )
+        )
+    result = runner.run(suite)
+    if not result.wasSuccessful():
+        print(result)
+    assert result.wasSuccessful()
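The decorators above make the new suite strictly opt-in: `@pytest.mark.extra_large` keeps it out of normal test runs, and the `skipif` bails when `astor` is not installed. Putting the two hints from the code together, running it manually is `pip install astor` followed by `pytest keras_nlp/tests/doc_tests/docstring_test.py --run_extra_large`.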
