keras-team · mattdangerw · Mar 26, 2024 · Oct 29, 2023 · Oct 29, 2023 · Oct 31, 2023
diff --git a/keras_nlp/models/__init__.py b/keras_nlp/models/__init__.py
@@ -72,6 +72,7 @@
     DistilBertTokenizer,
 )
 from keras_nlp.models.electra.electra_backbone import ElectraBackbone
+from keras_nlp.models.electra.electra_preprocessor import ElectraPreprocessor
 from keras_nlp.models.electra.electra_tokenizer import ElectraTokenizer
 from keras_nlp.models.f_net.f_net_backbone import FNetBackbone
 from keras_nlp.models.f_net.f_net_classifier import FNetClassifier

diff --git a/keras_nlp/models/electra/electra_backbone.py b/keras_nlp/models/electra/electra_backbone.py
@@ -12,13 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import copy
+
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.backend import keras
 from keras_nlp.layers.modeling.position_embedding import PositionEmbedding
 from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding
 from keras_nlp.layers.modeling.transformer_encoder import TransformerEncoder
 from keras_nlp.models.backbone import Backbone
+from keras_nlp.models.electra.electra_presets import backbone_presets
 from keras_nlp.utils.keras_utils import gelu_approximate
+from keras_nlp.utils.python_utils import classproperty
 
 
 def electra_kernel_initializer(stddev=0.02):
@@ -36,8 +40,9 @@ class ElectraBackbone(Backbone):
     or classification task networks.
 
     The default constructor gives a fully customizable, randomly initialized
-    Electra encoder with any number of layers, heads, and embedding
-    dimensions.
+    ELECTRA encoder with any number of layers, heads, and embedding
+    dimensions. To load preset architectures and weights, use the
+    `from_preset()` constructor.
 
     Disclaimer: Pre-trained models are provided on an "as is" basis, without
     warranties or conditions of any kind. The underlying model is provided by a
@@ -70,6 +75,13 @@ class ElectraBackbone(Backbone):
         "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]),
         "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
         }
+
+        # Pre-trained ELECTRA encoder.
+        model = keras_nlp.models.ElectraBackbone.from_preset(
+            "electra_base_discriminator_en"
+        )
+        model(input_data)
+
         # Randomly initialized Electra encoder
         backbone = keras_nlp.models.ElectraBackbone(
             vocabulary_size=1000,
@@ -234,3 +246,7 @@ def get_config(self):
             }
         )
         return config
+
+    @classproperty
+    def presets(cls):
+        return copy.deepcopy(backbone_presets)  # Protect the shared dict.
diff --git a/keras_nlp/models/electra/electra_backbone_test.py b/keras_nlp/models/electra/electra_backbone_test.py
@@ -54,3 +54,37 @@ def test_saved_model(self):
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
         )
+
+    @pytest.mark.large
+    def test_smallest_preset(self):
+        self.run_preset_test(
+            cls=ElectraBackbone,
+            preset="electra_small_discriminator_en",
+            input_data={  # A single sequence of four token ids.
+                "token_ids": ops.array([[101, 1996, 4248, 102]], dtype="int32"),
+                "segment_ids": ops.zeros((1, 4), dtype="int32"),
+                "padding_mask": ops.ones((1, 4), dtype="int32"),
+            },
+            expected_output_shape={  # Batch dim of 1 matches the input.
+                "sequence_output": (1, 4, 256),
+                "pooled_output": (1, 256),
+            },
+            # The forward pass from a preset should be stable!
+            expected_partial_output={
+                "sequence_output": (
+                    ops.array([0.32287, 0.18754, -0.22272, -0.24177, 1.18977])
+                ),
+                "pooled_output": (
+                    ops.array([-0.02974, 0.23383, 0.08430, -0.19471, 0.14822])
+                ),
+            },
+        )
+
+    @pytest.mark.extra_large
+    def test_all_presets(self):
+        for preset in ElectraBackbone.presets:  # Keys are preset names.
+            self.run_preset_test(
+                cls=ElectraBackbone,
+                preset=preset,
+                input_data=self.input_data,
+            )
diff --git a/keras_nlp/models/electra/electra_preprocessor.py b/keras_nlp/models/electra/electra_preprocessor.py
@@ -0,0 +1,163 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+
+from keras_nlp.api_export import keras_nlp_export
+from keras_nlp.layers.preprocessing.multi_segment_packer import (
+    MultiSegmentPacker,
+)
+from keras_nlp.models.electra.electra_presets import backbone_presets
+from keras_nlp.models.electra.electra_tokenizer import ElectraTokenizer
+from keras_nlp.models.preprocessor import Preprocessor
+from keras_nlp.utils.keras_utils import (
+    convert_inputs_to_list_of_tensor_segments,
+)
+from keras_nlp.utils.keras_utils import pack_x_y_sample_weight
+from keras_nlp.utils.python_utils import classproperty
+
+
+@keras_nlp_export("keras_nlp.models.ElectraPreprocessor")
+class ElectraPreprocessor(Preprocessor):
+    """An ELECTRA preprocessing layer which tokenizes and packs inputs.
+
+    This preprocessing layer will do three things:
+
+     1. Tokenize any number of input segments using the `tokenizer`.
+     2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`
+       with the appropriate `"[CLS]"`, `"[SEP]"` and `"[PAD]"` tokens.
+     3. Construct a dictionary with keys `"token_ids"`, `"segment_ids"` and
+       `"padding_mask"`, that can be passed directly to an ELECTRA model.
+
+    This layer can be used directly with `tf.data.Dataset.map` to preprocess
+    string data in the `(x, y, sample_weight)` format used by
+    `keras.Model.fit`.
+
+    Args:
+        tokenizer: A `keras_nlp.models.ElectraTokenizer` instance.
+        sequence_length: The length of the packed inputs.
+        truncate: string. The algorithm to truncate a list of batched segments
+            to fit within `sequence_length`. The value can be either
+            `round_robin` or `waterfall`:
+                - `"round_robin"`: Available space is assigned one token at a
+                    time in a round-robin fashion to the inputs that still need
+                    some, until the limit is reached.
+                - `"waterfall"`: The allocation of the budget is done using a
+                    "waterfall" algorithm that allocates quota in a
+                    left-to-right manner and fills up the buckets until we run
+                    out of budget. It supports an arbitrary number of segments.
+
+    Call arguments:
+        x: A tensor of single string sequences, or a tuple of multiple
+            tensor sequences to be packed together. Inputs may be batched or
+            unbatched. For single sequences, raw python inputs will be converted
+            to tensors. For multiple sequences, pass tensors directly.
+        y: Any label data. Will be passed through unaltered.
+        sample_weight: Any label weight data. Will be passed through unaltered.
+
+    Examples:
+
+    Directly calling the layer on data.
+    ```python
+    preprocessor = keras_nlp.models.ElectraPreprocessor.from_preset(
+        "electra_base_discriminator_en"
+    )
+    preprocessor(["The quick brown fox jumped.", "Call me Ishmael."])
+
+    # Custom vocabulary.
+    vocab = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
+    vocab += ["The", "quick", "brown", "fox", "jumped", "."]
+    tokenizer = keras_nlp.models.ElectraTokenizer(vocabulary=vocab)
+    preprocessor = keras_nlp.models.ElectraPreprocessor(tokenizer)
+    preprocessor("The quick brown fox jumped.")
+    ```
+
+    Mapping with `tf.data.Dataset`.
+    ```python
+    preprocessor = keras_nlp.models.ElectraPreprocessor.from_preset(
+        "electra_base_discriminator_en"
+    )
+
+    first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    label = tf.constant([1, 1])
+
+    # Map labeled single sentences.
+    ds = tf.data.Dataset.from_tensor_slices((first, label))
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+    # Map unlabeled single sentences.
+    ds = tf.data.Dataset.from_tensor_slices(first)
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+    # Map labeled sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices(((first, second), label))
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+    # Map unlabeled sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices((first, second))
+    # Watch out for tf.data's default unpacking of tuples here!
+    # Best to invoke the `preprocessor` directly in this case.
+    ds = ds.map(
+        lambda first, second: preprocessor(x=(first, second)),
+        num_parallel_calls=tf.data.AUTOTUNE,
+    )
+    ```
+    """
+
+    def __init__(
+        self,
+        tokenizer,
+        sequence_length=512,
+        truncate="round_robin",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.tokenizer = tokenizer
+        self.packer = MultiSegmentPacker(
+            start_value=self.tokenizer.cls_token_id,
+            end_value=self.tokenizer.sep_token_id,
+            pad_value=self.tokenizer.pad_token_id,
+            truncate=truncate,
+            sequence_length=sequence_length,
+        )
+
+    def get_config(self):
+        config = super().get_config()
+        config.update(
+            {
+                "sequence_length": self.packer.sequence_length,
+                "truncate": self.packer.truncate,
+            }
+        )
+        return config
+
+    def call(self, x, y=None, sample_weight=None):
+        x = convert_inputs_to_list_of_tensor_segments(x)
+        x = [self.tokenizer(segment) for segment in x]
+        token_ids, segment_ids = self.packer(x)
+        x = {
+            "token_ids": token_ids,
+            "segment_ids": segment_ids,
+            "padding_mask": token_ids != self.tokenizer.pad_token_id,
+        }
+        return pack_x_y_sample_weight(x, y, sample_weight)
+
+    @classproperty
+    def tokenizer_cls(cls):
+        return ElectraTokenizer
+
+    @classproperty
+    def presets(cls):
+        return copy.deepcopy({**backbone_presets})
diff --git a/keras_nlp/models/electra/electra_preprocessor_test.py b/keras_nlp/models/electra/electra_preprocessor_test.py
@@ -0,0 +1,67 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pytest
+
+from keras_nlp.models.electra.electra_preprocessor import ElectraPreprocessor
+from keras_nlp.models.electra.electra_tokenizer import ElectraTokenizer
+from keras_nlp.tests.test_case import TestCase
+
+
+class ElectraPreprocessorTest(TestCase):
+    def setUp(self):
+        self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
+        self.vocab += ["THE", "QUICK", "BROWN", "FOX"]
+        self.vocab += ["the", "quick", "brown", "fox"]
+        self.tokenizer = ElectraTokenizer(vocabulary=self.vocab)
+        self.init_kwargs = {
+            "tokenizer": self.tokenizer,
+            "sequence_length": 8,  # Matches the 8-wide expected rows.
+        }
+        self.input_data = (
+            ["THE QUICK BROWN FOX."],  # "." is not in the vocab.
+            [1],  # Pass through labels.
+            [1.0],  # Pass through sample_weights.
+        )
+
+    def test_preprocessor_basics(self):
+        self.run_preprocessing_layer_test(
+            cls=ElectraPreprocessor,
+            init_kwargs=self.init_kwargs,
+            input_data=self.input_data,
+            expected_output=(
+                {
+                    "token_ids": [[2, 5, 6, 7, 8, 1, 3, 0]],
+                    "segment_ids": [[0, 0, 0, 0, 0, 0, 0, 0]],
+                    "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
+                },
+                [1],  # Pass through labels.
+                [1.0],  # Pass through sample_weights.
+            ),
+        )
+
+    def test_errors_for_2d_list_input(self):
+        preprocessor = ElectraPreprocessor(**self.init_kwargs)
+        ambiguous_input = [["one", "two"], ["three", "four"]]
+        with self.assertRaises(ValueError):
+            preprocessor(ambiguous_input)
+
+    @pytest.mark.extra_large
+    def test_all_presets(self):
+        for preset in ElectraPreprocessor.presets:
+            self.run_preset_test(
+                cls=ElectraPreprocessor,
+                preset=preset,
+                input_data=self.input_data,
+            )
diff --git a/keras_nlp/models/electra/electra_presets.py b/keras_nlp/models/electra/electra_presets.py
@@ -0,0 +1,69 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ELECTRA model preset configurations."""
+
+backbone_presets = {  # Preset name -> metadata and Kaggle handle.
+    "electra_base_discriminator_en": {
+        "metadata": {
+            "description": (
+                "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators. "
+                "This is base discriminator model with 12 layers."
+            ),
+            "params": 109482240,
+            "official_name": "ELECTRA",
+            "path": "electra",
+            "model_card": "https://github.com/google-research/electra",
+        },
+        "kaggle_handle": "kaggle://pranavprajapati16/electra/keras/electra_base_discriminator_en/1",
+    },
+    "electra_small_discriminator_en": {
+        "metadata": {
+            "description": (
+                "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators. "
+                "This is small discriminator model with 12 layers."
+            ),
+            "params": 13548800,
+            "official_name": "ELECTRA",
+            "path": "electra",
+            "model_card": "https://github.com/google-research/electra",
+        },
+        "kaggle_handle": "kaggle://pranavprajapati16/electra/keras/electra_small_discriminator_en/1",
+    },
+    "electra_small_generator_en": {
+        "metadata": {
+            "description": (
+                "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators. "
+                "This is small generator model with 12 layers."
+            ),
+            "params": 13548800,
+            "official_name": "ELECTRA",
+            "path": "electra",
+            "model_card": "https://github.com/google-research/electra",
+        },
+        "kaggle_handle": "kaggle://pranavprajapati16/electra/keras/electra_small_generator_en/1",
+    },
+    "electra_base_generator_en": {
+        "metadata": {
+            "description": (
+                "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators. "
+                "This is base generator model with 12 layers."
+            ),
+            "params": 33576960,
+            "official_name": "ELECTRA",
+            "path": "electra",
+            "model_card": "https://github.com/google-research/electra",
+        },
+        "kaggle_handle": "kaggle://pranavprajapati16/electra/keras/electra_base_generator_en/1",
+    },
+}