 
 """RoBERTa masked language model preprocessor layer."""
 
-import copy
-
+from absl import logging
 from tensorflow import keras
 
 from keras_nlp.layers.masked_lm_mask_generator import MaskedLMMaskGenerator
-from keras_nlp.models.preprocessor import Preprocessor
-from keras_nlp.models.roberta.roberta_multi_segment_packer import (
-    RobertaMultiSegmentPacker,
-)
-from keras_nlp.models.roberta.roberta_presets import backbone_presets
-from keras_nlp.models.roberta.roberta_tokenizer import RobertaTokenizer
-from keras_nlp.utils.keras_utils import (
-    convert_inputs_to_list_of_tensor_segments,
-)
+from keras_nlp.models.roberta.roberta_preprocessor import RobertaPreprocessor
 from keras_nlp.utils.keras_utils import pack_x_y_sample_weight
-from keras_nlp.utils.python_utils import classproperty
 
 
 @keras.utils.register_keras_serializable(package="keras_nlp")
-class RobertaMaskedLMPreprocessor(Preprocessor):
+class RobertaMaskedLMPreprocessor(RobertaPreprocessor):
     """RoBERTa preprocessing for the masked language modeling task.
 
     This preprocessing layer will prepare inputs for a masked language modeling
@@ -114,21 +104,18 @@ def __init__(
         self,
         tokenizer,
         sequence_length=512,
+        truncate="round_robin",
         mask_selection_rate=0.15,
         mask_selection_length=96,
-        truncate="round_robin",
         **kwargs,
     ):
-        super().__init__(**kwargs)
-
-        self._tokenizer = tokenizer
-        self.packer = RobertaMultiSegmentPacker(
-            start_value=tokenizer.start_token_id,
-            end_value=tokenizer.end_token_id,
-            pad_value=tokenizer.pad_token_id,
-            truncate=truncate,
+        super().__init__(
+            tokenizer,
             sequence_length=sequence_length,
+            truncate=truncate,
+            **kwargs,
         )
+
         self.masker = MaskedLMMaskGenerator(
            mask_selection_rate=mask_selection_rate,
            mask_selection_length=mask_selection_length,
@@ -145,40 +132,29 @@ def get_config(self):
         config = super().get_config()
         config.update(
             {
-                "sequence_length": self.packer.sequence_length,
                 "mask_selection_rate": self.masker.mask_selection_rate,
                 "mask_selection_length": self.masker.mask_selection_length,
-                "truncate": self.packer.truncate,
             }
         )
         return config
 
     def call(self, x, y=None, sample_weight=None):
-        if y is not None:
-            raise ValueError(
-                "`RobertaMaskedLMPreprocessor` received labeled data (`y` is "
-                "not `None`). No labels should be passed in as "
-                "this layer generates training labels dynamically from raw "
-                "text features passed as `x`. Received: `y={y}`."
+        if y is not None or sample_weight is not None:
+            logging.warning(
+                f"{self.__class__.__name__} generates `y` and `sample_weight` "
+                "based on your input data, but your data already contains `y` "
+                "or `sample_weight`. Your `y` and `sample_weight` will be "
+                "ignored."
             )
 
-        x = convert_inputs_to_list_of_tensor_segments(x)
-        x = [self.tokenizer(segment) for segment in x]
-        token_ids = self.packer(x)
+        x = super().call(x)
+        token_ids, padding_mask = x["token_ids"], x["padding_mask"]
         masker_outputs = self.masker(token_ids)
         x = {
             "token_ids": masker_outputs["token_ids"],
-            "padding_mask": token_ids != self.tokenizer.pad_token_id,
+            "padding_mask": padding_mask,
             "mask_positions": masker_outputs["mask_positions"],
         }
         y = masker_outputs["mask_ids"]
         sample_weight = masker_outputs["mask_weights"]
         return pack_x_y_sample_weight(x, y, sample_weight)
-
-    @classproperty
-    def tokenizer_cls(cls):
-        return RobertaTokenizer
-
-    @classproperty
-    def presets(cls):
-        return copy.deepcopy(backbone_presets)
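
For context, a minimal usage sketch of the layer after this refactor (not part of the commit). It assumes `RobertaMaskedLMPreprocessor` is exported under `keras_nlp.models` and that the `roberta_base_en` preset is available; the preset name and hyperparameter values below are illustrative only, and constructing the layer directly from a `RobertaTokenizer` instance works the same way.

```python
import keras_nlp

# Build the preprocessor from a preset; keyword arguments are forwarded to
# the constructor shown in the diff above.
preprocessor = keras_nlp.models.RobertaMaskedLMPreprocessor.from_preset(
    "roberta_base_en",
    sequence_length=128,
    mask_selection_rate=0.15,
    mask_selection_length=96,
)

# Packing is now delegated to the parent RobertaPreprocessor, after which
# MaskedLMMaskGenerator masks the packed token ids. The layer returns an
# (x, y, sample_weight) tuple ready for `model.fit`.
x, y, sample_weight = preprocessor(["The quick brown fox jumped."])
# x contains "token_ids", "padding_mask", and "mask_positions";
# y holds the original ids at the masked positions, and sample_weight
# holds the corresponding mask weights.
```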