diff --git a/keras_nlp/models/albert/albert_tokenizer.py b/keras_nlp/models/albert/albert_tokenizer.py
index e2647c30c0..35dc7ec8d5 100644
--- a/keras_nlp/models/albert/albert_tokenizer.py
+++ b/keras_nlp/models/albert/albert_tokenizer.py
@@ -15,14 +15,12 @@
 """ALBERT tokenizer."""

 import copy
-import os

 from tensorflow import keras

 from keras_nlp.models.albert.albert_presets import backbone_presets
 from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer
 from keras_nlp.utils.python_utils import classproperty
-from keras_nlp.utils.python_utils import format_docstring


 @keras.utils.register_keras_serializable(package="keras_nlp")
@@ -89,52 +87,3 @@ def __init__(self, proto, **kwargs):
     @classproperty
     def presets(cls):
         return copy.deepcopy(backbone_presets)
-
-    @classmethod
-    @format_docstring(names=", ".join(backbone_presets))
-    def from_preset(
-        cls,
-        preset,
-        **kwargs,
-    ):
-        """Instantiate an ALBERT tokenizer from preset vocabulary.
-
-        Args:
-            preset: string. Must be one of {{names}}.
-
-        Examples:
-        ```python
-        # Load a preset tokenizer.
-        tokenizer = keras_nlp.models.AlbertTokenizer.from_preset(
-            "albert_base_en_uncased",
-        )
-
-        # Tokenize some input.
-        tokenizer("The quick brown fox tripped.")
-
-        # Detokenize some input.
-        tokenizer.detokenize([5, 6, 7, 8, 9])
-        ```
-        """
-        if preset not in cls.presets:
-            raise ValueError(
-                "`preset` must be one of "
-                f"""{", ".join(cls.presets)}. Received: {preset}."""
-            )
-        metadata = cls.presets[preset]
-
-        spm_proto = keras.utils.get_file(
-            "vocab.spm",
-            metadata["spm_proto_url"],
-            cache_subdir=os.path.join("models", preset),
-            file_hash=metadata["spm_proto_hash"],
-        )
-
-        config = metadata["preprocessor_config"]
-        config.update(
-            {
-                "proto": spm_proto,
-            },
-        )
-
-        return cls.from_config({**config, **kwargs})
diff --git a/keras_nlp/models/bert/bert_tokenizer.py b/keras_nlp/models/bert/bert_tokenizer.py
index 8f1db18040..629e2a3c44 100644
--- a/keras_nlp/models/bert/bert_tokenizer.py
+++ b/keras_nlp/models/bert/bert_tokenizer.py
@@ -14,7 +14,6 @@
 """BERT tokenizer."""

 import copy
-import os

 from tensorflow import keras

@@ -22,7 +21,6 @@
 from keras_nlp.models.bert.bert_presets import classifier_presets
 from keras_nlp.tokenizers.word_piece_tokenizer import WordPieceTokenizer
 from keras_nlp.utils.python_utils import classproperty
-from keras_nlp.utils.python_utils import format_docstring

 PRESET_NAMES = ", ".join(list(backbone_presets) + list(classifier_presets))

@@ -112,52 +110,3 @@ def __init__(
     @classproperty
     def presets(cls):
         return copy.deepcopy({**backbone_presets, **classifier_presets})
-
-    @classmethod
-    @format_docstring(names=PRESET_NAMES)
-    def from_preset(
-        cls,
-        preset,
-        **kwargs,
-    ):
-        """Instantiate a BERT tokenizer from preset vocabulary.
-
-        Args:
-            preset: string. Must be one of {{names}}.
-
-        Examples:
-        ```python
-        # Load a preset tokenizer.
-        tokenizer = keras_nlp.models.BertTokenizer.from_preset(
-            "bert_base_en_uncased",
-        )
-
-        # Tokenize some input.
-        tokenizer("The quick brown fox tripped.")
-
-        # Detokenize some input.
-        tokenizer.detokenize([5, 6, 7, 8, 9])
-        ```
-        """
-        if preset not in cls.presets:
-            raise ValueError(
-                "`preset` must be one of "
-                f"""{", ".join(cls.presets)}. Received: {preset}."""
-            )
-        metadata = cls.presets[preset]
-
-        vocabulary = keras.utils.get_file(
-            "vocab.txt",
-            metadata["vocabulary_url"],
-            cache_subdir=os.path.join("models", preset),
-            file_hash=metadata["vocabulary_hash"],
-        )
-
-        config = metadata["preprocessor_config"]
-        config.update(
-            {
-                "vocabulary": vocabulary,
-            },
-        )
-
-        return cls.from_config({**config, **kwargs})
diff --git a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py
index aad3092ce5..dfd6ba83e4 100644
--- a/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py
+++ b/keras_nlp/models/deberta_v3/deberta_v3_tokenizer.py
@@ -15,14 +15,12 @@
 """DeBERTa tokenizer."""

 import copy
-import os

 from tensorflow import keras

 from keras_nlp.models.deberta_v3.deberta_v3_presets import backbone_presets
 from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer
 from keras_nlp.utils.python_utils import classproperty
-from keras_nlp.utils.python_utils import format_docstring


 @keras.utils.register_keras_serializable(package="keras_nlp")
@@ -89,52 +87,3 @@ def __init__(self, proto, **kwargs):
     @classproperty
     def presets(cls):
         return copy.deepcopy(backbone_presets)
-
-    @classmethod
-    @format_docstring(names=", ".join(backbone_presets))
-    def from_preset(
-        cls,
-        preset,
-        **kwargs,
-    ):
-        """Instantiate a DeBERTa tokenizer from preset vocabulary.
-
-        Args:
-            preset: string. Must be one of {{names}}.
-
-        Examples:
-        ```python
-        # Load a preset tokenizer.
-        tokenizer = keras_nlp.models.DebertaV3Tokenizer.from_preset(
-            "deberta_v3_base_en",
-        )
-
-        # Tokenize some input.
-        tokenizer("The quick brown fox tripped.")
-
-        # Detokenize some input.
-        tokenizer.detokenize([5, 6, 7, 8, 9])
-        ```
-        """
-        if preset not in cls.presets:
-            raise ValueError(
-                "`preset` must be one of "
-                f"""{", ".join(cls.presets)}. Received: {preset}."""
-            )
-        metadata = cls.presets[preset]
-
-        spm_proto = keras.utils.get_file(
-            "vocab.spm",
-            metadata["spm_proto_url"],
-            cache_subdir=os.path.join("models", preset),
-            file_hash=metadata["spm_proto_hash"],
-        )
-
-        config = metadata["preprocessor_config"]
-        config.update(
-            {
-                "proto": spm_proto,
-            },
-        )
-
-        return cls.from_config({**config, **kwargs})
diff --git a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
index 103f015cca..3d0dd7d794 100644
--- a/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
+++ b/keras_nlp/models/distil_bert/distil_bert_tokenizer.py
@@ -14,14 +14,12 @@
 """DistilBERT tokenizer."""

 import copy
-import os

 from tensorflow import keras

 from keras_nlp.models.distil_bert.distil_bert_presets import backbone_presets
 from keras_nlp.tokenizers.word_piece_tokenizer import WordPieceTokenizer
 from keras_nlp.utils.python_utils import classproperty
-from keras_nlp.utils.python_utils import format_docstring


 @keras.utils.register_keras_serializable(package="keras_nlp")
@@ -109,52 +107,3 @@ def __init__(
     @classproperty
     def presets(cls):
         return copy.deepcopy(backbone_presets)
-
-    @classmethod
-    @format_docstring(names=", ".join(backbone_presets))
-    def from_preset(
-        cls,
-        preset,
-        **kwargs,
-    ):
-        """Instantiate a DistilBERT tokenizer from preset vocabulary.
-
-        Args:
-            preset: string. Must be one of {{names}}.
-
-        Examples:
-        ```python
-        # Load a preset tokenizer.
-        tokenizer = keras_nlp.models.DistilBertTokenizer.from_preset(
-            "distil_bert_base_en_uncased",
-        )
-
-        # Tokenize some input.
- tokenizer("The quick brown fox tripped.") - - # Detokenize some input. - tokenizer.detokenize([5, 6, 7, 8, 9]) - ``` - """ - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. Received: {preset}.""" - ) - metadata = cls.presets[preset] - - vocabulary = keras.utils.get_file( - "vocab.txt", - metadata["vocabulary_url"], - cache_subdir=os.path.join("models", preset), - file_hash=metadata["vocabulary_hash"], - ) - - config = metadata["preprocessor_config"] - config.update( - { - "vocabulary": vocabulary, - }, - ) - - return cls.from_config({**config, **kwargs}) diff --git a/keras_nlp/models/gpt2/gpt2_tokenizer.py b/keras_nlp/models/gpt2/gpt2_tokenizer.py index d6ce5ab60e..9665391a0a 100644 --- a/keras_nlp/models/gpt2/gpt2_tokenizer.py +++ b/keras_nlp/models/gpt2/gpt2_tokenizer.py @@ -14,14 +14,12 @@ """GPT-2 preprocessing layers.""" import copy -import os from tensorflow import keras from keras_nlp.models.gpt2.gpt2_presets import backbone_presets from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer from keras_nlp.utils.python_utils import classproperty -from keras_nlp.utils.python_utils import format_docstring @keras.utils.register_keras_serializable(package="keras_nlp") @@ -118,58 +116,3 @@ def __init__( @classproperty def presets(cls): return copy.deepcopy(backbone_presets) - - @classmethod - @format_docstring(names=", ".join(backbone_presets)) - def from_preset( - cls, - preset, - **kwargs, - ): - """Instantiate a GPT-2 tokenizer from preset vocabulary and merge rules. - - Args: - preset: string. Must be one of {{names}}. - - Examples: - ```python - # Load a preset tokenizer. - tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset( - "gpt2_base_en", - ) - # Tokenize some input. - tokenizer("The quick brown fox tripped.") - # Detokenize some input. - tokenizer.detokenize([5, 6, 7, 8, 9]) - ``` - """ - - if preset not in cls.presets: - raise ValueError( - "`preset` must be one of " - f"""{", ".join(cls.presets)}. 
-            )
-        metadata = cls.presets[preset]
-
-        vocabulary = keras.utils.get_file(
-            "vocab.json",
-            metadata["vocabulary_url"],
-            cache_subdir=os.path.join("models", preset),
-            file_hash=metadata["vocabulary_hash"],
-        )
-        merges = keras.utils.get_file(
-            "merges.txt",
-            metadata["merges_url"],
-            cache_subdir=os.path.join("models", preset),
-            file_hash=metadata["merges_hash"],
-        )
-
-        config = metadata["preprocessor_config"]
-        config.update(
-            {
-                "vocabulary": vocabulary,
-                "merges": merges,
-            },
-        )
-
-        return cls.from_config({**config, **kwargs})
diff --git a/keras_nlp/models/roberta/roberta_tokenizer.py b/keras_nlp/models/roberta/roberta_tokenizer.py
index 6778bd4286..933a28fc5c 100644
--- a/keras_nlp/models/roberta/roberta_tokenizer.py
+++ b/keras_nlp/models/roberta/roberta_tokenizer.py
@@ -15,14 +15,12 @@
 """RoBERTa tokenizer."""

 import copy
-import os

 from tensorflow import keras

 from keras_nlp.models.roberta.roberta_presets import backbone_presets
 from keras_nlp.tokenizers.byte_pair_tokenizer import BytePairTokenizer
 from keras_nlp.utils.python_utils import classproperty
-from keras_nlp.utils.python_utils import format_docstring


 @keras.utils.register_keras_serializable(package="keras_nlp")
@@ -126,58 +124,3 @@ def __init__(
     @classproperty
     def presets(cls):
         return copy.deepcopy(backbone_presets)
-
-    @classmethod
-    @format_docstring(names=", ".join(backbone_presets))
-    def from_preset(
-        cls,
-        preset,
-        **kwargs,
-    ):
-        """Instantiate a RoBERTa tokenizer from preset vocabulary and merge rules.
-
-        Args:
-            preset: string. Must be one of {{names}}.
-
-        Examples:
-        ```python
-        # Load a preset tokenizer.
-        tokenizer = keras_nlp.models.RobertaTokenizer.from_preset(
-            "roberta_base_en",
-        )
-        # Tokenize some input.
-        tokenizer("The quick brown fox tripped.")
-        # Detokenize some input.
-        tokenizer.detokenize([5, 6, 7, 8, 9])
-        ```
-        """
-
-        if preset not in cls.presets:
-            raise ValueError(
-                "`preset` must be one of "
-                f"""{", ".join(cls.presets)}. Received: {preset}."""
-            )
-        metadata = cls.presets[preset]
-
-        vocabulary = keras.utils.get_file(
-            "vocab.json",
-            metadata["vocabulary_url"],
-            cache_subdir=os.path.join("models", preset),
-            file_hash=metadata["vocabulary_hash"],
-        )
-        merges = keras.utils.get_file(
-            "merges.txt",
-            metadata["merges_url"],
-            cache_subdir=os.path.join("models", preset),
-            file_hash=metadata["merges_hash"],
-        )
-
-        config = metadata["preprocessor_config"]
-        config.update(
-            {
-                "vocabulary": vocabulary,
-                "merges": merges,
-            },
-        )
-
-        return cls.from_config({**config, **kwargs})
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
index 3851e1db34..a675e55530 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -15,7 +15,6 @@
 """XLM-RoBERTa tokenizer."""

 import copy
-import os

 import tensorflow as tf
 from tensorflow import keras
@@ -23,7 +22,6 @@
 from keras_nlp.models.xlm_roberta.xlm_roberta_presets import backbone_presets
 from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer
 from keras_nlp.utils.python_utils import classproperty
-from keras_nlp.utils.python_utils import format_docstring
 from keras_nlp.utils.tf_utils import tensor_to_string_list


@@ -162,52 +160,3 @@ def detokenize(self, inputs):
     @classproperty
     def presets(cls):
         return copy.deepcopy(backbone_presets)
-
-    @classmethod
-    @format_docstring(names=", ".join(backbone_presets))
-    def from_preset(
-        cls,
-        preset,
-        **kwargs,
-    ):
-        """Instantiate an XLM-RoBERTa tokenizer from preset vocabulary.
-
-        Args:
-            preset: string. Must be one of {{names}}.
-
-        Examples:
-        ```python
-        # Load a preset tokenizer.
-        tokenizer = keras_nlp.models.XLMRobertaTokenizer.from_preset(
-            "xlm_roberta_base_multi",
-        )
-
-        # Tokenize some input.
-        tokenizer("The quick brown fox tripped.")
-
-        # Detokenize some input.
-        tokenizer.detokenize(tf.constant([581, 63773, 119455, 6, 147797]))
-        ```
-        """
-        if preset not in cls.presets:
-            raise ValueError(
-                "`preset` must be one of "
-                f"""{", ".join(cls.presets)}. Received: {preset}."""
-            )
-        metadata = cls.presets[preset]
-
-        spm_proto = keras.utils.get_file(
-            "vocab.spm",
-            metadata["spm_proto_url"],
-            cache_subdir=os.path.join("models", preset),
-            file_hash=metadata["spm_proto_hash"],
-        )
-
-        config = metadata["preprocessor_config"]
-        config.update(
-            {
-                "proto": spm_proto,
-            },
-        )
-
-        return cls.from_config({**config, **kwargs})
diff --git a/keras_nlp/tokenizers/byte_pair_tokenizer.py b/keras_nlp/tokenizers/byte_pair_tokenizer.py
index 3b61007a03..8d7e886c1b 100644
--- a/keras_nlp/tokenizers/byte_pair_tokenizer.py
+++ b/keras_nlp/tokenizers/byte_pair_tokenizer.py
@@ -20,6 +20,7 @@
 """

 import json
+import os
 from typing import Iterable
 from typing import List

@@ -27,6 +28,8 @@
 from tensorflow import keras

 from keras_nlp.tokenizers import tokenizer
+from keras_nlp.utils.python_utils import classproperty
+from keras_nlp.utils.python_utils import format_docstring
 from keras_nlp.utils.tf_utils import assert_tf_text_installed

 try:
@@ -536,3 +539,90 @@ def _bpe_merge_and_update_cache(self, tokens):
             tokenized_words, axis=1, separator=" "
         )
         self.cache.insert(tokens, tokenized_words)
+
+    @classproperty
+    def presets(cls):
+        return {}
+
+    @classmethod
+    def from_preset(
+        cls,
+        preset,
+        **kwargs,
+    ):
+        """Instantiate {{model_name}} tokenizer from preset vocabulary.
+
+        Args:
+            preset: string. Must be one of "{{preset_names}}".
+
+        Examples:
+        ```python
+        # Load a preset tokenizer.
+        tokenizer = {{model_name}}.from_preset("{{example_preset_name}}")
+
+        # Tokenize some input.
+        tokenizer("The quick brown fox tripped.")
+
+        # Detokenize some input.
+        tokenizer.detokenize([5, 6, 7, 8, 9])
+        ```
+        """
+
+        if not cls.presets:
+            raise NotImplementedError(
+                "No presets have been created for this class"
+            )
+
+        if preset not in cls.presets:
+            raise ValueError(
+                "`preset` must be one of "
+                f"""{", ".join(cls.presets)}. Received: {preset}."""
+            )
+        metadata = cls.presets[preset]
+
+        vocabulary = keras.utils.get_file(
+            "vocab.json",
+            metadata["vocabulary_url"],
+            cache_subdir=os.path.join("models", preset),
+            file_hash=metadata["vocabulary_hash"],
+        )
+        merges = keras.utils.get_file(
+            "merges.txt",
+            metadata["merges_url"],
+            cache_subdir=os.path.join("models", preset),
+            file_hash=metadata["merges_hash"],
+        )
+
+        config = metadata["preprocessor_config"]
+        config.update(
+            {
+                "vocabulary": vocabulary,
+                "merges": merges,
+            },
+        )
+
+        return cls.from_config({**config, **kwargs})
+
+    def __init_subclass__(cls, **kwargs):
+        # Use __init_subclass__ to set up a correct docstring for from_preset.
+        super().__init_subclass__(**kwargs)
+
+        # If the subclass does not define from_preset, assign a wrapper so that
+        # each class can have a distinct docstring.
+        if "from_preset" not in cls.__dict__:
+
+            def from_preset(calling_cls, *args, **kwargs):
+                return super(cls, calling_cls).from_preset(*args, **kwargs)
+
+            cls.from_preset = classmethod(from_preset)
+
+        # Format and assign the docstring unless the subclass has overridden it.
+        if cls.from_preset.__doc__ is None:
+            cls.from_preset.__func__.__doc__ = (
+                BytePairTokenizer.from_preset.__doc__
+            )
+            format_docstring(
+                model_name=cls.__name__,
+                example_preset_name=next(iter(cls.presets), ""),
+                preset_names='", "'.join(cls.presets),
+            )(cls.from_preset.__func__)
diff --git a/keras_nlp/tokenizers/sentence_piece_tokenizer.py b/keras_nlp/tokenizers/sentence_piece_tokenizer.py
index a14276cd1a..fda70d4ffe 100644
--- a/keras_nlp/tokenizers/sentence_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/sentence_piece_tokenizer.py
@@ -14,12 +14,15 @@
 import base64
 import binascii
+import os
 from typing import List

 import tensorflow as tf
 from tensorflow import keras

 from keras_nlp.tokenizers import tokenizer
+from keras_nlp.utils.python_utils import classproperty
+from keras_nlp.utils.python_utils import format_docstring
 from keras_nlp.utils.tf_utils import assert_tf_text_installed
 from keras_nlp.utils.tf_utils import tensor_to_string_list

@@ -206,3 +209,83 @@ def tokenize(self, inputs):

     def detokenize(self, inputs):
         return self._sentence_piece.detokenize(inputs)
+
+    @classproperty
+    def presets(cls):
+        return {}
+
+    @classmethod
+    def from_preset(
+        cls,
+        preset,
+        **kwargs,
+    ):
+        """Instantiate {{model_name}} tokenizer from preset vocabulary.
+
+        Args:
+            preset: string. Must be one of "{{preset_names}}".
+
+        Examples:
+        ```python
+        # Load a preset tokenizer.
+        tokenizer = {{model_name}}.from_preset("{{example_preset_name}}")
+
+        # Tokenize some input.
+        tokenizer("The quick brown fox tripped.")
+
+        # Detokenize some input.
+        tokenizer.detokenize([5, 6, 7, 8, 9])
+        ```
+        """
+
+        if not cls.presets:
+            raise NotImplementedError(
+                "No presets have been created for this class"
+            )
+
+        if preset not in cls.presets:
+            raise ValueError(
+                "`preset` must be one of "
+                f"""{", ".join(cls.presets)}. Received: {preset}."""
+            )
+        metadata = cls.presets[preset]
+
+        spm_proto = keras.utils.get_file(
+            "vocab.spm",
+            metadata["spm_proto_url"],
+            cache_subdir=os.path.join("models", preset),
+            file_hash=metadata["spm_proto_hash"],
+        )
+
+        config = metadata["preprocessor_config"]
+        config.update(
+            {
+                "proto": spm_proto,
+            },
+        )
+
+        return cls.from_config({**config, **kwargs})
+
+    def __init_subclass__(cls, **kwargs):
+        # Use __init_subclass__ to set up a correct docstring for from_preset.
+        super().__init_subclass__(**kwargs)
+
+        # If the subclass does not define from_preset, assign a wrapper so that
+        # each class can have a distinct docstring.
+        if "from_preset" not in cls.__dict__:
+
+            def from_preset(calling_cls, *args, **kwargs):
+                return super(cls, calling_cls).from_preset(*args, **kwargs)
+
+            cls.from_preset = classmethod(from_preset)
+
+        # Format and assign the docstring unless the subclass has overridden it.
+        if cls.from_preset.__doc__ is None:
+            cls.from_preset.__func__.__doc__ = (
+                SentencePieceTokenizer.from_preset.__doc__
+            )
+            format_docstring(
+                model_name=cls.__name__,
+                example_preset_name=next(iter(cls.presets), ""),
+                preset_names='", "'.join(cls.presets),
+            )(cls.from_preset.__func__)
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 00ffccc652..ed593a3297 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
 from typing import Iterable
 from typing import List

@@ -19,6 +20,8 @@
 from tensorflow import keras

 from keras_nlp.tokenizers import tokenizer
+from keras_nlp.utils.python_utils import classproperty
+from keras_nlp.utils.python_utils import format_docstring
 from keras_nlp.utils.tf_utils import assert_tf_text_installed

 try:
@@ -416,3 +419,83 @@ def tokenize(self, inputs):

     def detokenize(self, inputs):
         return self._fast_word_piece.detokenize(inputs)
+
+    @classproperty
+    def presets(cls):
+        return {}
+
+    @classmethod
+    def from_preset(
+        cls,
+        preset,
+        **kwargs,
+    ):
+        """Instantiate {{model_name}} tokenizer from preset vocabulary.
+
+        Args:
+            preset: string. Must be one of "{{preset_names}}".
+
+        Examples:
+        ```python
+        # Load a preset tokenizer.
+        tokenizer = {{model_name}}.from_preset("{{example_preset_name}}")
+
+        # Tokenize some input.
+        tokenizer("The quick brown fox tripped.")
+
+        # Detokenize some input.
+        tokenizer.detokenize([5, 6, 7, 8, 9])
+        ```
+        """
+
+        if not cls.presets:
+            raise NotImplementedError(
+                "No presets have been created for this class"
+            )
+
+        if preset not in cls.presets:
+            raise ValueError(
+                "`preset` must be one of "
+                f"""{", ".join(cls.presets)}. Received: {preset}."""
+            )
+        metadata = cls.presets[preset]
+
+        vocabulary = keras.utils.get_file(
+            "vocab.txt",
+            metadata["vocabulary_url"],
+            cache_subdir=os.path.join("models", preset),
+            file_hash=metadata["vocabulary_hash"],
+        )
+
+        config = metadata["preprocessor_config"]
+        config.update(
+            {
+                "vocabulary": vocabulary,
+            },
+        )
+
+        return cls.from_config({**config, **kwargs})
+
+    def __init_subclass__(cls, **kwargs):
+        # Use __init_subclass__ to set up a correct docstring for from_preset.
+        super().__init_subclass__(**kwargs)
+
+        # If the subclass does not define from_preset, assign a wrapper so that
+        # each class can have a distinct docstring.
+ if "from_preset" not in cls.__dict__: + + def from_preset(calling_cls, *args, **kwargs): + return super(cls, calling_cls).from_preset(*args, **kwargs) + + cls.from_preset = classmethod(from_preset) + + # Format and assign the docstring unless the subclass has overridden it. + if cls.from_preset.__doc__ is None: + cls.from_preset.__func__.__doc__ = ( + WordPieceTokenizer.from_preset.__doc__ + ) + format_docstring( + model_name=cls.__name__, + example_preset_name=next(iter(cls.presets), ""), + preset_names='", "'.join(cls.presets), + )(cls.from_preset.__func__)