diff --git a/keras_nlp/models/__init__.py b/keras_nlp/models/__init__.py index bdd6042538..21d8394066 100644 --- a/keras_nlp/models/__init__.py +++ b/keras_nlp/models/__init__.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor +from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer from keras_nlp.models.bert.bert_backbone import BertBackbone from keras_nlp.models.bert.bert_classifier import BertClassifier from keras_nlp.models.bert.bert_preprocessor import BertPreprocessor diff --git a/keras_nlp/models/albert/__init__.py b/keras_nlp/models/albert/__init__.py new file mode 100644 index 0000000000..6e4df4e727 --- /dev/null +++ b/keras_nlp/models/albert/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/keras_nlp/models/albert/albert_preprocessor.py b/keras_nlp/models/albert/albert_preprocessor.py new file mode 100644 index 0000000000..61d37c34d2 --- /dev/null +++ b/keras_nlp/models/albert/albert_preprocessor.py @@ -0,0 +1,201 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ALBERT preprocessor layer.""" + +from tensorflow import keras + +from keras_nlp.layers.multi_segment_packer import MultiSegmentPacker +from keras_nlp.utils.keras_utils import ( + convert_inputs_to_list_of_tensor_segments, +) +from keras_nlp.utils.keras_utils import pack_x_y_sample_weight +from keras_nlp.utils.python_utils import classproperty + + +@keras.utils.register_keras_serializable(package="keras_nlp") +class AlbertPreprocessor(keras.layers.Layer): + """An ALBERT preprocessing layer which tokenizes and packs inputs. + + This preprocessing layer will do three things: + + - Tokenize any number of input segments using the `tokenizer`. + - Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`. + with the appropriate `"[CLS]"`, `"[SEP]"` and `""` tokens. + - Construct a dictionary with keys `"token_ids"`, `"segment_ids"` and + `"padding_mask"`, that can be passed directly to + `keras_nlp.models.AlbertBackbone`. + + This layer can be used directly with `tf.data.Dataset.map` to preprocess + string data in the `(x, y, sample_weight)` format used by + `keras.Model.fit`. + + The call method of this layer accepts three arguments, `x`, `y`, and + `sample_weight`. `x` can be a python string or tensor representing a single + segment, a list of python strings representing a batch of single segments, + or a list of tensors representing multiple segments to be packed together. + `y` and `sample_weight` are both optional, can have any format, and will be + passed through unaltered. 
+ + Special care should be taken when using `tf.data` to map over an unlabeled + tuple of string segments. `tf.data.Dataset.map` will unpack this tuple + directly into the call arguments of this layer, rather than forward all + argument to `x`. To handle this case, it is recommended to explicitly call + the layer, e.g. `ds.map(lambda seg1, seg2: preprocessor(x=(seg1, seg2)))`. + + Args: + tokenizer: A `keras_nlp.models.AlbertTokenizer` instance. + sequence_length: The length of the packed inputs. + truncate: string. The algorithm to truncate a list of batched segments + to fit within `sequence_length`. The value can be either + `round_robin` or `waterfall`: + - `"round_robin"`: Available space is assigned one token at a + time in a round-robin fashion to the inputs that still need + some, until the limit is reached. + - `"waterfall"`: The allocation of the budget is done using a + "waterfall" algorithm that allocates quota in a + left-to-right manner and fills up the buckets until we run + out of budget. It supports an arbitrary number of segments. + + Examples: + ```python + tokenizer = keras_nlp.models.AlbertTokenizer(proto="model.spm") + preprocessor = keras_nlp.models.AlbertPreprocessor( + tokenizer=tokenizer, + sequence_length=10, + ) + + # Tokenize and pack a single sentence. + sentence = tf.constant("The quick brown fox jumped.") + preprocessor(sentence) + # Same output. + preprocessor("The quick brown fox jumped.") + + # Tokenize and a batch of single sentences. + sentences = tf.constant( + ["The quick brown fox jumped.", "Call me Ishmael."] + ) + preprocessor(sentences) + # Same output. + preprocessor( + ["The quick brown fox jumped.", "Call me Ishmael."] + ) + + # Tokenize and pack a sentence pair. + first_sentence = tf.constant("The quick brown fox jumped.") + second_sentence = tf.constant("The fox tripped.") + preprocessor((first_sentence, second_sentence)) + + # Map a dataset to preprocess a single sentence. 
+ features = tf.constant( + ["The quick brown fox jumped.", "Call me Ishmael."] + ) + labels = tf.constant([0, 1]) + ds = tf.data.Dataset.from_tensor_slices((features, labels)) + ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) + + # Map a dataset to preprocess sentence pairs. + first_sentences = tf.constant( + ["The quick brown fox jumped.", "Call me Ishmael."] + ) + second_sentences = tf.constant( + ["The fox tripped.", "Oh look, a whale."] + ) + labels = tf.constant([1, 1]) + ds = tf.data.Dataset.from_tensor_slices( + ( + (first_sentences, second_sentences), labels + ) + ) + ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE) + + # Map a dataset to preprocess unlabeled sentence pairs. + first_sentences = tf.constant( + ["The quick brown fox jumped.", "Call me Ishmael."] + ) + second_sentences = tf.constant( + ["The fox tripped.", "Oh look, a whale."] + ) + ds = tf.data.Dataset.from_tensor_slices((first_sentences, second_sentences)) + # Watch out for tf.data's default unpacking of tuples here! + # Best to invoke the `preprocessor` directly in this case. 
+ ds = ds.map( + lambda s1, s2: preprocessor(x=(s1, s2)), + num_parallel_calls=tf.data.AUTOTUNE, + ) + ``` + """ + + def __init__( + self, + tokenizer, + sequence_length=512, + truncate="round_robin", + **kwargs, + ): + super().__init__(**kwargs) + self._tokenizer = tokenizer + self.packer = MultiSegmentPacker( + start_value=self.tokenizer.cls_token_id, + end_value=self.tokenizer.sep_token_id, + pad_value=self.tokenizer.pad_token_id, + truncate=truncate, + sequence_length=sequence_length, + ) + + @property + def tokenizer(self): + """The `keras_nlp.models.AlbertTokenizer` used to tokenize strings.""" + return self._tokenizer + + def get_config(self): + config = super().get_config() + config.update( + { + "tokenizer": keras.layers.serialize(self.tokenizer), + "sequence_length": self.packer.sequence_length, + "truncate": self.packer.truncate, + } + ) + return config + + @classmethod + def from_config(cls, config): + if "tokenizer" in config and isinstance(config["tokenizer"], dict): + config["tokenizer"] = keras.layers.deserialize(config["tokenizer"]) + return cls(**config) + + def call(self, x, y=None, sample_weight=None): + x = convert_inputs_to_list_of_tensor_segments(x) + x = [self.tokenizer(segment) for segment in x] + token_ids, segment_ids = self.packer(x) + x = { + "token_ids": token_ids, + "segment_ids": segment_ids, + "padding_mask": token_ids != self.tokenizer.pad_token_id, + } + return pack_x_y_sample_weight(x, y, sample_weight) + + @classproperty + def presets(cls): + return {} + + @classmethod + def from_preset( + cls, + preset, + sequence_length=None, + truncate="round_robin", + **kwargs, + ): + raise NotImplementedError diff --git a/keras_nlp/models/albert/albert_preprocessor_test.py b/keras_nlp/models/albert/albert_preprocessor_test.py new file mode 100644 index 0000000000..01267a6edc --- /dev/null +++ b/keras_nlp/models/albert/albert_preprocessor_test.py @@ -0,0 +1,174 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for ALBERT preprocessor layer.""" + +import io +import os + +import sentencepiece +import tensorflow as tf +from absl.testing import parameterized +from tensorflow import keras + +from keras_nlp.models.albert.albert_preprocessor import AlbertPreprocessor +from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer + + +class AlbertPreprocessorTest(tf.test.TestCase, parameterized.TestCase): + def setUp(self): + bytes_io = io.BytesIO() + vocab_data = tf.data.Dataset.from_tensor_slices( + ["the quick brown fox", "the earth is round"] + ) + sentencepiece.SentencePieceTrainer.train( + sentence_iterator=vocab_data.as_numpy_iterator(), + model_writer=bytes_io, + vocab_size=10, + model_type="WORD", + pad_id=0, + unk_id=1, + bos_id=2, + eos_id=3, + pad_piece="", + unk_piece="", + bos_piece="[CLS]", + eos_piece="[SEP]", + ) + self.proto = bytes_io.getvalue() + + self.preprocessor = AlbertPreprocessor( + tokenizer=AlbertTokenizer(proto=self.proto), + sequence_length=12, + ) + + def test_tokenize_strings(self): + input_data = "the quick brown fox" + output = self.preprocessor(input_data) + self.assertAllEqual( + output["token_ids"], [2, 4, 9, 5, 7, 3, 0, 0, 0, 0, 0, 0] + ) + self.assertAllEqual( + output["segment_ids"], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + ) + self.assertAllEqual( + output["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0] + ) + + def test_tokenize_list_of_strings(self): + # We should handle a list of strings as a batch. 
+ input_data = ["the quick brown fox"] * 4 + output = self.preprocessor(input_data) + self.assertAllEqual( + output["token_ids"], + [[2, 4, 9, 5, 7, 3, 0, 0, 0, 0, 0, 0]] * 4, + ) + self.assertAllEqual( + output["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 + ) + self.assertAllEqual( + output["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 + ) + + def test_tokenize_labeled_batch(self): + x = tf.constant(["the quick brown fox"] * 4) + y = tf.constant([1] * 4) + sw = tf.constant([1.0] * 4) + x_out, y_out, sw_out = self.preprocessor(x, y, sw) + self.assertAllEqual( + x_out["token_ids"], + [[2, 4, 9, 5, 7, 3, 0, 0, 0, 0, 0, 0]] * 4, + ) + self.assertAllEqual( + x_out["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 + ) + self.assertAllEqual( + x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 + ) + self.assertAllEqual(y_out, y) + self.assertAllEqual(sw_out, sw) + + def test_tokenize_labeled_dataset(self): + x = tf.constant(["the quick brown fox"] * 4) + y = tf.constant([1] * 4) + sw = tf.constant([1.0] * 4) + ds = tf.data.Dataset.from_tensor_slices((x, y, sw)) + ds = ds.map(self.preprocessor) + x_out, y_out, sw_out = ds.batch(4).take(1).get_single_element() + self.assertAllEqual( + x_out["token_ids"], + [[2, 4, 9, 5, 7, 3, 0, 0, 0, 0, 0, 0]] * 4, + ) + self.assertAllEqual( + x_out["segment_ids"], [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] * 4 + ) + self.assertAllEqual( + x_out["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]] * 4 + ) + self.assertAllEqual(y_out, y) + self.assertAllEqual(sw_out, sw) + + def test_tokenize_multiple_sentences(self): + sentence_one = tf.constant("the quick brown fox") + sentence_two = tf.constant("the earth") + output = self.preprocessor((sentence_one, sentence_two)) + self.assertAllEqual( + output["token_ids"], + [2, 4, 9, 5, 7, 3, 4, 6, 3, 0, 0, 0], + ) + self.assertAllEqual( + output["segment_ids"], [0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0] + ) + self.assertAllEqual( + 
output["padding_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0] + ) + + def test_tokenize_multiple_batched_sentences(self): + sentence_one = tf.constant(["the quick brown fox"] * 4) + sentence_two = tf.constant(["the earth"] * 4) + # The first tuple or list is always interpreted as an enumeration of + # separate sequences to concatenate. + output = self.preprocessor((sentence_one, sentence_two)) + self.assertAllEqual( + output["token_ids"], + [[2, 4, 9, 5, 7, 3, 4, 6, 3, 0, 0, 0]] * 4, + ) + self.assertAllEqual( + output["segment_ids"], [[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]] * 4 + ) + self.assertAllEqual( + output["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]] * 4 + ) + + def test_errors_for_2d_list_input(self): + ambiguous_input = [["one", "two"], ["three", "four"]] + with self.assertRaises(ValueError): + self.preprocessor(ambiguous_input) + + @parameterized.named_parameters( + ("tf_format", "tf", "model"), + ("keras_format", "keras_v3", "model.keras"), + ) + def test_saved_model(self, save_format, filename): + input_data = tf.constant(["the quick brown fox"]) + inputs = keras.Input(dtype="string", shape=()) + outputs = self.preprocessor(inputs) + model = keras.Model(inputs, outputs) + path = os.path.join(self.get_temp_dir(), filename) + model.save(path, save_format=save_format) + restored_model = keras.models.load_model(path) + self.assertAllEqual( + model(input_data)["token_ids"], + restored_model(input_data)["token_ids"], + ) diff --git a/keras_nlp/models/albert/albert_tokenizer.py b/keras_nlp/models/albert/albert_tokenizer.py new file mode 100644 index 0000000000..e9e5ac19fc --- /dev/null +++ b/keras_nlp/models/albert/albert_tokenizer.py @@ -0,0 +1,95 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ALBERT tokenizer.""" + + +from tensorflow import keras + +from keras_nlp.tokenizers.sentence_piece_tokenizer import SentencePieceTokenizer +from keras_nlp.utils.python_utils import classproperty + + +@keras.utils.register_keras_serializable(package="keras_nlp") +class AlbertTokenizer(SentencePieceTokenizer): + """ALBERT tokenizer layer based on SentencePiece. + + This tokenizer class will tokenize raw strings into integer sequences and + is based on `keras_nlp.tokenizers.SentencePieceTokenizer`. Unlike the + underlying tokenizer, it will check for all special tokens needed by + ALBERT models and provides a `from_preset()` method to automatically + download a matching vocabulary for a ALBERT preset. + + This tokenizer does not provide truncation or padding of inputs. It can be + combined with a `keras_nlp.models.AlbertPreprocessor` layer for input + packing. + + If input is a batch of strings (rank > 0), the layer will output a + `tf.RaggedTensor` where the last dimension of the output is ragged. + + If input is a scalar string (rank == 0), the layer will output a dense + `tf.Tensor` with static shape `[None]`. + + Args: + proto: Either a `string` path to a SentencePiece proto file, or a + `bytes` object with a serialized SentencePiece proto. See the + [SentencePiece repository](https://github.com/google/sentencepiece) + for more details on the format. + + Examples: + + ```python + tokenizer = keras_nlp.models.AlbertTokenizer(proto="model.spm") + + # Batched inputs. + tokenizer(["the quick brown fox", "the earth is round"]) + + # Unbatched inputs. 
+ tokenizer("the quick brown fox") + + # Detokenization. + tokenizer.detokenize(tf.constant([[2, 14, 2231, 886, 2385, 3]])) + ``` + """ + + def __init__(self, proto, **kwargs): + super().__init__(proto=proto, **kwargs) + + # Check for necessary special tokens. + cls_token = "[CLS]" + sep_token = "[SEP]" + pad_token = "" + for token in [cls_token, sep_token, pad_token]: + if token not in self.get_vocabulary(): + raise ValueError( + f"Cannot find token `'{token}'` in the provided " + f"`vocabulary`. Please provide `'{token}'` in your " + "`vocabulary` or use a pretrained `vocabulary` name." + ) + + self.cls_token_id = self.token_to_id(cls_token) + self.sep_token_id = self.token_to_id(sep_token) + self.pad_token_id = self.token_to_id(pad_token) + + @classproperty + def presets(cls): + return {} + + @classmethod + def from_preset( + cls, + preset, + **kwargs, + ): + raise NotImplementedError diff --git a/keras_nlp/models/albert/albert_tokenizer_test.py b/keras_nlp/models/albert/albert_tokenizer_test.py new file mode 100644 index 0000000000..0116f11e22 --- /dev/null +++ b/keras_nlp/models/albert/albert_tokenizer_test.py @@ -0,0 +1,89 @@ +# Copyright 2022 The KerasNLP Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for ALBERT tokenizer.""" + +import io +import os + +import sentencepiece +import tensorflow as tf +from absl.testing import parameterized +from tensorflow import keras + +from keras_nlp.models.albert.albert_tokenizer import AlbertTokenizer + + +class AlbertTokenizerTest(tf.test.TestCase, parameterized.TestCase): + def setUp(self): + bytes_io = io.BytesIO() + vocab_data = tf.data.Dataset.from_tensor_slices( + ["the quick brown fox", "the earth is round"] + ) + sentencepiece.SentencePieceTrainer.train( + sentence_iterator=vocab_data.as_numpy_iterator(), + model_writer=bytes_io, + vocab_size=10, + model_type="WORD", + pad_id=0, + unk_id=1, + bos_id=2, + eos_id=3, + pad_piece="", + unk_piece="", + bos_piece="[CLS]", + eos_piece="[SEP]", + ) + self.proto = bytes_io.getvalue() + + self.tokenizer = AlbertTokenizer(proto=self.proto) + + def test_tokenize(self): + input_data = "the quick brown fox" + output = self.tokenizer(input_data) + self.assertAllEqual(output, [4, 9, 5, 7]) + + def test_tokenize_batch(self): + input_data = tf.constant(["the quick brown fox", "the earth is round"]) + output = self.tokenizer(input_data) + self.assertAllEqual(output, [[4, 9, 5, 7], [4, 6, 8, 1]]) + + def test_detokenize(self): + input_data = tf.constant([[4, 9, 5, 7]]) + output = self.tokenizer.detokenize(input_data) + self.assertEqual(output, tf.constant(["the quick brown fox"])) + + def test_vocabulary_size(self): + tokenizer = AlbertTokenizer(proto=self.proto) + self.assertEqual(tokenizer.vocabulary_size(), 10) + + @parameterized.named_parameters( + ("tf_format", "tf", "model"), + ("keras_format", "keras_v3", "model.keras"), + ) + def test_saved_model(self, save_format, filename): + input_data = tf.constant(["the quick brown fox"]) + + inputs = keras.Input(dtype="string", shape=()) + outputs = self.tokenizer(inputs) + model = keras.Model(inputs, outputs) + + path = os.path.join(self.get_temp_dir(), filename) + model.save(path, save_format=save_format) + + restored_model = 
keras.models.load_model(path) + self.assertAllEqual( + model(input_data), + restored_model(input_data), + )