Add a tokenizer for the Mistral backbone (#1383)

tirthasheshpatel · web-flow · commit 24bb087b20fa · 2024-01-02T11:20:23.000-07:00
diff --git a/keras_nlp/models/mistral/mistral_tokenizer.py b/keras_nlp/models/mistral/mistral_tokenizer.py
@@ -0,0 +1,75 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from keras_nlp.api_export import keras_nlp_export
+from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer
+
+
+@keras_nlp_export("keras_nlp.models.MistralTokenizer")
+class MistralTokenizer(SentencePieceTokenizer):
+    """Mistral tokenizer layer based on SentencePiece.
+
+    This tokenizer class will tokenize raw strings into integer sequences and
+    is based on `keras_nlp.tokenizers.SentencePieceTokenizer`. Unlike the
+    underlying tokenizer, it will check for all special tokens needed by
+    Mistral models and provides a `from_preset()` method to automatically
+    download a matching vocabulary for a Mistral preset.
+
+    This tokenizer does not provide truncation or padding of inputs. It can be
+    combined with a `keras_nlp.models.MistralPreprocessor` layer for input
+    packing.
+
+    If input is a batch of strings (rank > 0), the layer will output a
+    `tf.RaggedTensor` where the last dimension of the output is ragged.
+
+    If input is a scalar string (rank == 0), the layer will output a dense
+    `tf.Tensor` with static shape `[None]`.
+
+    Args:
+        proto: Either a `string` path to a SentencePiece proto file, or a
+            `bytes` object with a serialized SentencePiece proto. See the
+            [SentencePiece repository](https://github.com/google/sentencepiece)
+            for more details on the format.
+
+    Examples:
+    ```python
+    # Unbatched input.
+    tokenizer = keras_nlp.models.MistralTokenizer.from_preset(
+        "mistral_base_en",
+    )
+    tokenizer("The quick brown fox jumped.")
+
+    # Batched input.
+    tokenizer(["The quick brown fox jumped.", "The fox slept."])
+
+    # Detokenization.
+    tokenizer.detokenize(tokenizer("The quick brown fox jumped."))
+    ```
+    """
+
+    def __init__(self, proto, **kwargs):
+        super().__init__(proto=proto, **kwargs)
+
+        # Check for necessary special tokens.
+        start_token = "<s>"
+        end_token = "</s>"
+        for token in [start_token, end_token]:
+            if token not in self.get_vocabulary():
+                raise ValueError(
+                    f"Cannot find token `'{token}'` in the provided "
+                    f"`vocabulary`. Please provide `'{token}'` in your "
+                    "`vocabulary` or use a pretrained `vocabulary` name."
+                )
+
+        self.start_token_id = self.token_to_id(start_token)
+        self.end_token_id = self.token_to_id(end_token)
diff --git a/keras_nlp/models/mistral/mistral_tokenizer_test.py b/keras_nlp/models/mistral/mistral_tokenizer_test.py
@@ -0,0 +1,46 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from keras_nlp.models.mistral.mistral_tokenizer import MistralTokenizer
+from keras_nlp.tests.test_case import TestCase
+
+
+class MistralTokenizerTest(TestCase):
+    def setUp(self):
+        self.init_kwargs = {
+            # Generated using create_mistral_test_proto.py
+            "proto": os.path.join(
+                self.get_test_data_dir(), "mistral_test_vocab.spm"
+            )
+        }
+        self.input_data = ["the quick brown fox", "the earth is round"]
+
+    def test_tokenizer_basics(self):
+        self.run_preprocessing_layer_test(
+            cls=MistralTokenizer,
+            init_kwargs=self.init_kwargs,
+            input_data=self.input_data,
+            expected_output=[[3, 8, 4, 6], [3, 5, 7, 9]],
+        )
+
+    def test_errors_missing_special_tokens(self):
+        with self.assertRaises(ValueError):
+            MistralTokenizer(
+                # Generated using create_no_special_token_proto.py
+                proto=os.path.join(
+                    self.get_test_data_dir(), "no_special_token_vocab.spm"
+                )
+            )
diff --git a/keras_nlp/tests/test_data/mistral_test_vocab.spm b/keras_nlp/tests/test_data/mistral_test_vocab.spm
diff --git a/tools/sentencepiece_testing/create_mistral_test_proto.py b/tools/sentencepiece_testing/create_mistral_test_proto.py
@@ -0,0 +1,32 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tools.sentencepiece_testing.utils import train_sentencepiece
+
+
+def main():
+    train_sentencepiece(
+        ["the quick brown fox", "the earth is round"],
+        "mistral_test_vocab.spm",
+        vocab_size=10,
+        model_type="WORD",
+        pad_id=-1,
+        unk_id=0,
+        bos_id=1,
+        eos_id=2,
+    )
+
+
+if __name__ == "__main__":
+    main()