
Commit 544aca0

Add Fenced Docstring Testing (#640)
* Add fenced docstring lib
* Fixes
* Use RegEx
* Some UT fixes
* More docstring fixes
* Fix find_files()
* Fix more docstrings
* Fix classifier docstrings
* Fix mistake
* Module-level test collection
* Stop testing Backbone
* Fixes
* Add source
* Conditional import for astor
* Small fixes
* Minor edit
* Fixes
* Add FNet preprocessor to skipped fenced ds
* Remove unused vars
* copy edit

Co-authored-by: Matt Watson <[email protected]>
1 parent 665174f commit 544aca0
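For context, the mechanism this commit adds: a doctest parser (`FencedCellParser` in the new `fenced_docstring_lib`) pulls fenced ```python blocks out of docstrings and runs them as tests, so docstring examples must actually execute. A minimal sketch of the extraction idea, going only off the commit message's "Use RegEx" note (the regex and helper name here are illustrative, not the commit's code):

```python
import re

# Illustrative only: grab the body of every ```python ... ``` block.
FENCE_RE = re.compile(r"```python\n(.*?)```", re.DOTALL)

def extract_fenced_examples(docstring):
    """Return the code bodies of all python-fenced blocks in a docstring."""
    return FENCE_RE.findall(docstring or "")
```

Each extracted body is then executed the way the updated `docstring_test.py` below does it: wrapped into a `doctest.DocTestSuite` with shared globals such as `tf`, `np`, and `keras_nlp`. That is why most of the diffs below are small docstring bug fixes.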


13 files changed: +303 −24 lines


keras_nlp/models/albert/albert_tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ class AlbertTokenizer(SentencePieceTokenizer):
     tokenizer("the quick brown fox")
 
     # Detokenization.
-    tokenizer.detokenize(tf.constant([[[2, 14, 2231, 886, 2385, 3]]))
+    tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]]))
     ```
     """
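Worth noting: the removed line opens three brackets but closes only two, so it was a syntax error, not just a wrong shape; this is exactly the class of bug the new fenced tests surface. A quick illustrative check (not part of the commit):

```python
import ast

bad = "tokenizer.detokenize(tf.constant([[[2, 14, 2231, 886, 2385, 3]]))"
try:
    ast.parse(bad)  # the unbalanced brackets fail to parse
except SyntaxError as err:
    print("caught:", err.msg)
```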

keras_nlp/models/backbone.py

Lines changed: 4 additions & 2 deletions
@@ -51,10 +51,12 @@ def from_preset(
     Examples:
     ```python
     # Load architecture and weights from preset
-    model = {{model_name}}.from_preset("{{example_preset_name}}")
+    model = keras_nlp.models.{{model_name}}.from_preset(
+        "{{example_preset_name}}"
+    )
 
     # Load randomly initialized model from preset architecture
-    model = {{model_name}}.from_preset(
+    model = keras_nlp.models.{{model_name}}.from_preset(
         "{{example_preset_name}}",
         load_weights=False
     )
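Since `{{model_name}}` and `{{example_preset_name}}` are template slots filled in per subclass, the rendered docstring for, say, BERT (using the preset name from the `bert_backbone.py` fix below) would read:

```python
import keras_nlp

# Load architecture and weights from preset.
model = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")

# Load a randomly initialized model from the preset architecture.
model = keras_nlp.models.BertBackbone.from_preset(
    "bert_base_en_uncased",
    load_weights=False,
)
```

The fully qualified `keras_nlp.models.` prefix matters here: the fenced tests execute examples with only `keras_nlp` (plus `tf`, `np`, etc.) in scope, so a bare `{{model_name}}` would be an undefined name.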

keras_nlp/models/bert/bert_backbone.py

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ class BertBackbone(Backbone):
     }
 
     # Pretrained BERT encoder
-    model = keras_nlp.models.BertBackbone.from_preset("base_base_en_uncased")
+    model = keras_nlp.models.BertBackbone.from_preset("bert_base_en_uncased")
     output = model(input_data)
 
     # Randomly initialized BERT encoder with a custom config

keras_nlp/models/distil_bert/distil_bert_classifier.py

Lines changed: 5 additions & 2 deletions
@@ -64,7 +64,7 @@ class DistilBertClassifier(PipelineModel):
     preprocessed_features = {
         "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
         "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(1, 12)),
+            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
     }
     labels = [0, 3]

@@ -84,6 +84,9 @@ class DistilBertClassifier(PipelineModel):
         num_classes=4,
         preprocessor=None,
     )
+    classifier.compile(
+        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    )
     classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
 
     # Access backbone programatically (e.g., to change `trainable`)

@@ -218,7 +221,7 @@ def from_preset(
     labels = [0, 3]
 
     # Use a shorter sequence length.
-    preprocessor = keras_nlp.models.DistilBertBackbone.from_preset(
+    preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
         "distil_bert_base_en_uncased",
         sequence_length=128,
     )
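Two of these fixes are about making the example actually run: the mask is two rows, so the old `shape=(1, 12)` contradicted the data, and the example now compiles with a `from_logits=True` loss before calling `fit`. A quick illustrative check of the shape fix (not part of the commit):

```python
import tensorflow as tf

rows = [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2  # 24 elements total
mask = tf.constant(rows, shape=(2, 12))  # OK: shape matches the data
# tf.constant(rows, shape=(1, 12)) would raise: too many elements for shape.
```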

keras_nlp/models/f_net/f_net_tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ class FNetTokenizer(SentencePieceTokenizer):
     tokenizer("the quick brown fox")
 
     # Detokenization.
-    tokenizer.detokenize(tf.constant([[[2, 14, 2231, 886, 2385, 3]]))
+    tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]]))
     ```
     """

keras_nlp/models/roberta/roberta_classifier.py

Lines changed: 6 additions & 6 deletions
@@ -197,8 +197,8 @@ def from_preset(
     features = ["The quick brown fox jumped.", "I forgot my homework."]
     labels = [0, 3]
 
-    # Create a RobertClassifier and fit your data.
-    classifier = keras_nlp.models.RobertClassifier.from_preset(
+    # Create a RobertaClassifier and fit your data.
+    classifier = keras_nlp.models.RobertaClassifier.from_preset(
         "roberta_base_en",
         num_classes=4,
     )

@@ -215,13 +215,13 @@ def from_preset(
     labels = [0, 3]
 
     # Use a shorter sequence length.
-    preprocessor = keras_nlp.models.RobertPreprocessor.from_preset(
+    preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(
         "roberta_base_en",
         sequence_length=128,
     )
 
-    # Create a RobertClassifier and fit your data.
-    classifier = keras_nlp.models.RobertClassifier.from_preset(
+    # Create a RobertaClassifier and fit your data.
+    classifier = keras_nlp.models.RobertaClassifier.from_preset(
         "roberta_base_en",
         num_classes=4,
         preprocessor=preprocessor,

@@ -244,7 +244,7 @@ def from_preset(
     labels = [0, 3]
 
     # Create a RoBERTa classifier and fit your data.
-    classifier = keras_nlp.models.RobertClassifier.from_preset(
+    classifier = keras_nlp.models.RobertaClassifier.from_preset(
         "roberta_base_en",
         num_classes=4,
         preprocessor=None,

keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py

Lines changed: 4 additions & 1 deletion
@@ -62,7 +62,7 @@ class XLMRobertaClassifier(PipelineModel):
     preprocessed_features = {
         "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
         "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(1, 12)),
+            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
     }
     labels = [0, 3]

@@ -82,6 +82,9 @@ class XLMRobertaClassifier(PipelineModel):
         num_classes=4,
         preprocessor=None,
     )
+    classifier.compile(
+        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    )
     classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
 
     # Access backbone programatically (e.g., to change `trainable`)

keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py

Lines changed: 1 addition & 1 deletion
@@ -186,7 +186,7 @@ def from_preset(
     tokenizer("The quick brown fox tripped.")
 
     # Detokenize some input.
-    tokenizer.detokenize([5, 6, 7, 8, 9])
+    tokenizer.detokenize(tf.constant([581, 63773, 119455, 6, 147797]))
     ```
     """
     if preset not in cls.presets:

keras_nlp/tests/doc_tests/docstring_test.py

Lines changed: 76 additions & 3 deletions
@@ -13,17 +13,23 @@
 # limitations under the License.
 
 import doctest
+import io
 import os
 import sys
 import unittest
 
 import numpy as np
 import pytest
+import sentencepiece
 import tensorflow as tf
 from tensorflow import keras
 
 import keras_nlp
 from keras_nlp.tests.doc_tests import docstring_lib
+from keras_nlp.tests.doc_tests import fenced_docstring_lib
+from keras_nlp.tests.doc_tests.fenced_docstring_lib import (
+    astor,  # For checking conditional import.
+)
 
 PACKAGE = "keras_nlp."

@@ -37,9 +43,6 @@ def find_modules():
     return keras_nlp_modules
 
 
-@pytest.mark.skipif(
-    sys.platform == "win32", reason="Numpy prints differently on windows"
-)
 def test_docstrings():
     keras_nlp_modules = find_modules()
     # As of this writing, it doesn't seem like pytest support load_tests

@@ -77,3 +80,73 @@ def test_docstrings():
     if not result.wasSuccessful():
         print(result)
     assert result.wasSuccessful()
+
+
+@pytest.mark.extra_large
+@pytest.mark.skipif(
+    astor is None,
+    reason="This test requires `astor`. Please `pip install astor` to run.",
+)
+def test_fenced_docstrings():
+    """Tests fenced code blocks in docstrings.
+
+    This can only be run manually. Run with:
+    `pytest keras_nlp/tests/doc_tests/docstring_test.py --run_extra_large`
+    """
+    keras_nlp_modules = find_modules()
+
+    runner = unittest.TextTestRunner()
+    suite = unittest.TestSuite()
+    for module in keras_nlp_modules:
+        # Temporarily stop testing gpt2 & deberta docstrings until we are
+        # exporting the symbols.
+        if "gpt2" in module.__name__ or "deberta_v3" in module.__name__:
+            continue
+        # Do not test certain modules.
+        if module.__name__ in [
+            # Base classes.
+            "keras_nlp.models.backbone",
+            "keras_nlp.models.preprocessor",
+            # Preprocessors and tokenizers which use `model.spm`.
+            "keras_nlp.models.albert.albert_preprocessor",
+            "keras_nlp.models.albert.albert_tokenizer",
+            "keras_nlp.models.xlm_roberta.xlm_roberta_preprocessor",
+            "keras_nlp.models.f_net.f_net_preprocessor",
+            "keras_nlp.models.f_net.f_net_tokenizer",
+        ]:
+            continue
+
+        suite.addTest(
+            doctest.DocTestSuite(
+                module,
+                test_finder=doctest.DocTestFinder(
+                    exclude_empty=False,
+                    parser=fenced_docstring_lib.FencedCellParser(
+                        fence_label="python"
+                    ),
+                ),
+                globs={
+                    "_print_if_not_none": fenced_docstring_lib._print_if_not_none
+                },
+                extraglobs={
+                    "tf": tf,
+                    "np": np,
+                    "os": os,
+                    "keras": keras,
+                    "keras_nlp": keras_nlp,
+                    "io": io,
+                    "sentencepiece": sentencepiece,
+                },
+                checker=docstring_lib.DoctestOutputChecker(),
+                optionflags=(
+                    doctest.ELLIPSIS
+                    | doctest.NORMALIZE_WHITESPACE
+                    | doctest.IGNORE_EXCEPTION_DETAIL
+                    | doctest.DONT_ACCEPT_BLANKLINE
+                ),
+            )
+        )
+    result = runner.run(suite)
+    if not result.wasSuccessful():
+        print(result)
+    assert result.wasSuccessful()
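The decorators above make the new suite strictly opt-in: `@pytest.mark.extra_large` keeps it out of normal test runs, and the `skipif` bails when `astor` is not installed. Putting the two hints from the code together, running it manually is `pip install astor` followed by `pytest keras_nlp/tests/doc_tests/docstring_test.py --run_extra_large`.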
