diff --git a/keras_nlp/models/albert/albert_backbone.py b/keras_nlp/models/albert/albert_backbone.py index 414bb97e87..99a7cc97a3 100644 --- a/keras_nlp/models/albert/albert_backbone.py +++ b/keras_nlp/models/albert/albert_backbone.py @@ -118,67 +118,47 @@ def __init__( f"`num_layers={num_layers}` and `num_groups={num_groups}`." ) - # Index of classification token in the vocabulary - cls_token_index = 0 - # Inputs - token_id_input = keras.Input( - shape=(None,), dtype="int32", name="token_ids" - ) - segment_id_input = keras.Input( - shape=(None,), dtype="int32", name="segment_ids" - ) - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens, positions, and segment ids. - token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=embedding_dim, embeddings_initializer=albert_kernel_initializer(), name="token_embedding", ) - token_embedding = token_embedding_layer(token_id_input) - position_embedding = PositionEmbedding( + self.position_embedding = PositionEmbedding( initializer=albert_kernel_initializer(), sequence_length=max_sequence_length, name="position_embedding", - )(token_embedding) - segment_embedding = keras.layers.Embedding( + ) + self.segment_embedding = keras.layers.Embedding( input_dim=num_segments, output_dim=embedding_dim, embeddings_initializer=albert_kernel_initializer(), name="segment_embedding", - )(segment_id_input) - - # Sum, normalize and apply dropout to embeddings. - x = keras.layers.Add()( - (token_embedding, position_embedding, segment_embedding) ) - x = keras.layers.LayerNormalization( + self.embeddings_add = keras.layers.Add( + name="embeddings_add", + ) + self.embeddings_layer_norm = keras.layers.LayerNormalization( name="embeddings_layer_norm", axis=-1, epsilon=1e-12, dtype="float32", - )(x) - x = keras.layers.Dropout( + ) + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(x) - - # Project the embedding to `hidden_dim`. - x = keras.layers.Dense( + ) + self.embeddings_projection = keras.layers.Dense( hidden_dim, kernel_initializer=albert_kernel_initializer(), name="embedding_projection", - )(x) - - def get_group_layer(group_idx): - """Defines a group `num_inner_repetitions` transformer layers and - returns the callable. - """ - transformer_layers = [ - TransformerEncoder( + ) + self.transformer_layers = [] + for group_idx in range(num_groups): + inner_layers = [] + for inner_idx in range(num_inner_repetitions): + layer = TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, activation=gelu_approximate, @@ -187,43 +167,51 @@ def get_group_layer(group_idx): kernel_initializer=albert_kernel_initializer(), name=f"group_{group_idx}_inner_layer_{inner_idx}", ) - for inner_idx in range(num_inner_repetitions) - ] - - def call(x, padding_mask): - for transformer_layer in transformer_layers: - x = transformer_layer(x, padding_mask=padding_mask) - return x - - return call - - num_calls_per_group = num_layers // num_groups - for group_idx in range(num_groups): - # Define the group. A group in ALBERT terminology is any number of - # repeated attention and FFN blocks. - group_layer = get_group_layer(group_idx) - - # Assume num_layers = 8, num_groups = 4. Then, the order of group - # calls will be 0, 0, 1, 1, 2, 2, 3, 3. - for call in range(num_calls_per_group): - x = group_layer(x, padding_mask=padding_mask) - - # Construct the two ALBERT outputs. 
The pooled output is a dense layer on - # top of the [CLS] token. - sequence_output = x - pooled_output = keras.layers.Dense( + inner_layers.append(layer) + self.transformer_layers.append(inner_layers) + self.pooled_dense = keras.layers.Dense( hidden_dim, kernel_initializer=albert_kernel_initializer(), activation="tanh", name="pooled_dense", - )(x[:, cls_token_index, :]) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + # Inputs + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + segment_id_input = keras.Input( + shape=(None,), dtype="int32", name="segment_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + # Embed tokens, positions, and segment ids. + tokens = self.token_embedding(token_id_input) + positions = self.position_embedding(tokens) + segments = self.segment_embedding(segment_id_input) + # Sum, normalize and apply dropout to embeddings. + x = self.embeddings_add((tokens, positions, segments)) + x = self.embeddings_layer_norm(x) + x = self.embeddings_dropout(x) + x = self.embeddings_projection(x) + # Call transformer layers with repeated groups. + num_calls_per_group = num_layers // num_groups + for group in self.transformer_layers: + for _ in range(num_calls_per_group): + for transformer_layer in group: + x = transformer_layer(x, padding_mask=padding_mask_input) + # Construct the two ALBERT outputs. The pooled output is a dense layer + # on top of the [CLS] token. + sequence_output = x + cls_token_index = 0 + pooled_output = self.pooled_dense(x[:, cls_token_index, :]) super().__init__( inputs={ "token_ids": token_id_input, "segment_ids": segment_id_input, - "padding_mask": padding_mask, + "padding_mask": padding_mask_input, }, outputs={ "sequence_output": sequence_output, @@ -231,7 +219,8 @@ def call(x, padding_mask): }, **kwargs, ) - # All references to `self` below this line + + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -244,7 +233,6 @@ def call(x, padding_mask): self.max_sequence_length = max_sequence_length self.num_segments = num_segments self.cls_token_index = cls_token_index - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/albert/albert_classifier.py b/keras_nlp/models/albert/albert_classifier.py index b0ed7bca7c..e8621cf6b4 100644 --- a/keras_nlp/models/albert/albert_classifier.py +++ b/keras_nlp/models/albert/albert_classifier.py @@ -155,30 +155,37 @@ def __init__( dropout=0.1, **kwargs, ): - inputs = backbone.input - pooled = backbone(inputs)["pooled_output"] - pooled = keras.layers.Dropout(dropout)(pooled) - outputs = keras.layers.Dense( + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.output_dense = keras.layers.Dense( num_classes, kernel_initializer=albert_kernel_initializer(), activation=activation, name="logits", - )(pooled) - # Instantiate using Functional API Model constructor + ) + self.output_dropout = keras.layers.Dropout( + dropout, + name="output_dropout", + ) + + # === Functional Model === + inputs = backbone.input + pooled = backbone(inputs)["pooled_output"] + pooled = self.output_dropout(pooled) + outputs = self.output_dense(pooled) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self._backbone = backbone - self._preprocessor = 
preprocessor + + # === Config === self.num_classes = num_classes self.activation = keras.activations.get(activation) self.dropout = dropout - # Default compilation + # === Default compilation === logit_output = self.activation == keras.activations.linear self.compile( loss=keras.losses.SparseCategoricalCrossentropy( diff --git a/keras_nlp/models/albert/albert_masked_lm.py b/keras_nlp/models/albert/albert_masked_lm.py index e95af7c207..139be0a762 100644 --- a/keras_nlp/models/albert/albert_masked_lm.py +++ b/keras_nlp/models/albert/albert_masked_lm.py @@ -97,32 +97,35 @@ class AlbertMaskedLM(Task): """ def __init__(self, backbone, preprocessor=None, **kwargs): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.masked_lm_head = MaskedLMHead( + vocabulary_size=backbone.vocabulary_size, + token_embedding=backbone.token_embedding, + intermediate_activation=gelu_approximate, + kernel_initializer=albert_kernel_initializer(), + name="mlm_head", + ) + + # === Functional Model === inputs = { **backbone.input, "mask_positions": keras.Input( shape=(None,), dtype="int32", name="mask_positions" ), } - backbone_outputs = backbone(backbone.input) - outputs = MaskedLMHead( - vocabulary_size=backbone.vocabulary_size, - token_embedding=backbone.token_embedding, - intermediate_activation=gelu_approximate, - kernel_initializer=albert_kernel_initializer(), - name="mlm_head", - )(backbone_outputs["sequence_output"], inputs["mask_positions"]) - + outputs = self.masked_lm_head( + backbone_outputs["sequence_output"], inputs["mask_positions"] + ) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, - **kwargs + **kwargs, ) - self.backbone = backbone - self.preprocessor = preprocessor - + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(5e-5), diff --git a/keras_nlp/models/backbone.py b/keras_nlp/models/backbone.py index 69da56593b..9e22be3f44 100644 --- a/keras_nlp/models/backbone.py +++ b/keras_nlp/models/backbone.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from keras_nlp.backend import config from keras_nlp.backend import keras from keras_nlp.utils.preset_utils import check_preset_class from keras_nlp.utils.preset_utils import load_from_preset @@ -23,24 +24,40 @@ class Backbone(keras.Model): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._token_embedding = None self._functional_layer_ids = set( id(layer) for layer in self._flatten_layers() ) + self._initialized = True def __dir__(self): - # Temporary fixes for weight saving. This mimics the following PR for + if config.keras_3(): + return super().__dir__() + + # Temporary fixes for Keras 2 saving. This mimics the following PR for # older version of Keras: https://github.com/keras-team/keras/pull/18982 def filter_fn(attr): - if attr == "_layer_checkpoint_dependencies": + if attr in [ + "_layer_checkpoint_dependencies", + "transformer_layers", + "encoder_transformer_layers", + "decoder_transformer_layers", + ]: return False return id(getattr(self, attr)) not in self._functional_layer_ids return filter(filter_fn, super().__dir__()) def __setattr__(self, name, value): - # Work around torch setattr for properties. - if name in ["token_embedding"]: + # Work around setattr issues for Keras 2 and Keras 3 torch backend. 
+ # Since all our state is covered by functional model we can route + # around custom setattr calls. + is_property = isinstance(getattr(type(self), name, None), property) + is_unitialized = not hasattr(self, "_initialized") + is_torch = config.backend() == "torch" + is_keras_2 = not config.keras_3() + if is_torch and (is_property or is_unitialized): + return object.__setattr__(self, name, value) + if is_keras_2 and is_unitialized: return object.__setattr__(self, name, value) return super().__setattr__(name, value) @@ -48,18 +65,13 @@ def __setattr__(self, name, value): def token_embedding(self): """A `keras.layers.Embedding` instance for embedding token ids. - This layer integer token ids to the hidden dim of the model. + This layer embeds integer token ids to the hidden dim of the model. """ return self._token_embedding @token_embedding.setter def token_embedding(self, value): - # Workaround tf.keras h5 checkpoint loading, which is sensitive to layer - # count mismatches and does not deduplicate layers. This could go away - # if we update our checkpoints to the newer `.weights.h5` format. - self._setattr_tracking = False self._token_embedding = value - self._setattr_tracking = True def get_config(self): # Don't chain to super here. The default `get_config()` for functional diff --git a/keras_nlp/models/bart/bart_backbone.py b/keras_nlp/models/bart/bart_backbone.py index 2679b84a9f..fdb8e5df5b 100644 --- a/keras_nlp/models/bart/bart_backbone.py +++ b/keras_nlp/models/bart/bart_backbone.py @@ -102,59 +102,34 @@ def __init__( max_sequence_length=1024, **kwargs, ): - # Encoder inputs - encoder_token_id_input = keras.Input( - shape=(None,), dtype="int32", name="encoder_token_ids" - ) - encoder_padding_mask = keras.Input( - shape=(None,), dtype="int32", name="encoder_padding_mask" - ) - - # Decoder inputs. - decoder_token_id_input = keras.Input( - shape=(None,), dtype="int32", name="decoder_token_ids" - ) - decoder_padding_mask = keras.Input( - shape=(None,), dtype="int32", name="decoder_padding_mask" - ) - - # Token embedding layer. This layer is shared by encoder and decoder. - token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, embeddings_initializer=bart_kernel_initializer(), name="token_embedding", ) - - # ===== Encoder ===== - - # Embed tokens and positions. - token_embedding = token_embedding_layer(encoder_token_id_input) - # Position embedding parameters are not shared by encode and decoder. - position_embedding = PositionEmbedding( + self.encoder_position_embedding = PositionEmbedding( initializer=bart_kernel_initializer(), sequence_length=max_sequence_length, name="encoder_position_embedding", - )(token_embedding) - - # Sum, normalize and apply dropout to embeddings. - x = keras.layers.Add(name="encoder_embeddings_add")( - (token_embedding, position_embedding) ) - x = keras.layers.LayerNormalization( + self.encoder_embeddings_add = keras.layers.Add( + name="encoder_embeddings_add", + ) + self.encoder_embeddings_layer_norm = keras.layers.LayerNormalization( name="encoder_embeddings_layer_norm", axis=-1, epsilon=1e-5, dtype="float32", - )(x) - x = keras.layers.Dropout( + ) + self.encoder_embeddings_dropout = keras.layers.Dropout( dropout, name="encoder_embeddings_dropout", - )(x) - - # Apply successive transformer encoder blocks. 
+ ) + self.encoder_transformer_layers = [] for i in range(num_layers): - x = TransformerEncoder( + layer = TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, activation=keras.activations.gelu, @@ -162,39 +137,29 @@ def __init__( layer_norm_epsilon=1e-5, kernel_initializer=bart_kernel_initializer(), name=f"transformer_encoder_layer_{i}", - )(x, padding_mask=encoder_padding_mask) - - encoder_output = x - - # ===== Decoder ===== - - # Embed tokens and positions. - token_embedding = token_embedding_layer(decoder_token_id_input) - # Position embedding parameters are not shared by encode and decoder. - position_embedding = PositionEmbedding( + ) + self.encoder_transformer_layers.append(layer) + self.decoder_position_embedding = PositionEmbedding( initializer=bart_kernel_initializer(), sequence_length=max_sequence_length, name="decoder_position_embedding", - )(token_embedding) - - # Sum, normalize and apply dropout to embeddings. - x = keras.layers.Add(name="decoder_embeddings_add")( - (token_embedding, position_embedding) ) - x = keras.layers.LayerNormalization( + self.decoder_embeddings_add = keras.layers.Add( + name="decoder_embeddings_add", + ) + self.decoder_embeddings_layer_norm = keras.layers.LayerNormalization( name="decoder_embeddings_layer_norm", axis=-1, epsilon=1e-5, dtype="float32", - )(x) - x = keras.layers.Dropout( + ) + self.decoder_embeddings_dropout = keras.layers.Dropout( dropout, name="decoder_embeddings_dropout", - )(x) - - # Apply successive transformer decoder blocks. + ) + self.decoder_transformer_layers = [] for i in range(num_layers): - transformer_decoder_layer = TransformerDecoder( + layer = TransformerDecoder( intermediate_dim=intermediate_dim, num_heads=num_heads, dropout=dropout, @@ -203,22 +168,51 @@ def __init__( kernel_initializer=bart_kernel_initializer(), name=f"transformer_decoder_layer_{i}", ) - x = transformer_decoder_layer( + self.decoder_transformer_layers.append(layer) + + # === Functional Model === + encoder_token_id_input = keras.Input( + shape=(None,), dtype="int32", name="encoder_token_ids" + ) + encoder_padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="encoder_padding_mask" + ) + decoder_token_id_input = keras.Input( + shape=(None,), dtype="int32", name="decoder_token_ids" + ) + decoder_padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="decoder_padding_mask" + ) + # Encoder. + tokens = self.token_embedding(encoder_token_id_input) + positions = self.encoder_position_embedding(tokens) + x = self.encoder_embeddings_add((tokens, positions)) + x = self.encoder_embeddings_layer_norm(x) + x = self.encoder_embeddings_dropout(x) + for transformer_layer in self.encoder_transformer_layers: + x = transformer_layer(x, padding_mask=encoder_padding_mask_input) + encoder_output = x + # Decoder. 
+ tokens = self.token_embedding(decoder_token_id_input) + positions = self.decoder_position_embedding(tokens) + x = self.decoder_embeddings_add((tokens, positions)) + x = self.decoder_embeddings_layer_norm(x) + x = self.decoder_embeddings_dropout(x) + for transformer_layer in self.decoder_transformer_layers: + x = transformer_layer( decoder_sequence=x, encoder_sequence=encoder_output, - decoder_padding_mask=decoder_padding_mask, - encoder_padding_mask=encoder_padding_mask, + decoder_padding_mask=decoder_padding_mask_input, + encoder_padding_mask=encoder_padding_mask_input, ) - decoder_output = x - # Instantiate using Functional API Model constructor super().__init__( inputs={ "encoder_token_ids": encoder_token_id_input, - "encoder_padding_mask": encoder_padding_mask, + "encoder_padding_mask": encoder_padding_mask_input, "decoder_token_ids": decoder_token_id_input, - "decoder_padding_mask": decoder_padding_mask, + "decoder_padding_mask": decoder_padding_mask_input, }, outputs={ "encoder_sequence_output": encoder_output, @@ -227,7 +221,7 @@ def __init__( **kwargs, ) - # All references to `self` below this line + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -235,7 +229,6 @@ def __init__( self.intermediate_dim = intermediate_dim self.dropout = dropout self.max_sequence_length = max_sequence_length - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/bart/bart_seq_2_seq_lm.py b/keras_nlp/models/bart/bart_seq_2_seq_lm.py index 2131519ce3..c17eafdb02 100644 --- a/keras_nlp/models/bart/bart_seq_2_seq_lm.py +++ b/keras_nlp/models/bart/bart_seq_2_seq_lm.py @@ -185,24 +185,21 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + + # === Functional Model === inputs = backbone.input hidden_states = backbone(inputs)["decoder_sequence_output"] outputs = backbone.token_embedding(hidden_states, reverse=True) - - # Instantiate using Functional API Model constructor. super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - self.backbone = backbone - self.preprocessor = preprocessor - self.generate_function = None - self._sampler = None - - # Default compilation + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(2e-5), @@ -280,33 +277,28 @@ def call_decoder_with_cache( cross-attention layer. """ # Embedding layers. - token_embedding = self.backbone.get_layer("token_embedding")( - decoder_token_ids + tokens = self.backbone.token_embedding(decoder_token_ids) + positions = self.backbone.decoder_position_embedding( + tokens, + start_index=self_attention_cache_update_index, ) - position_embedding = self.backbone.get_layer( - "decoder_position_embedding" - )(token_embedding, start_index=self_attention_cache_update_index) - # Sum, normalize and apply dropout to embeddings. 
-        x = self.backbone.get_layer("decoder_embeddings_add")(
-            (token_embedding, position_embedding)
-        )
-        x = self.backbone.get_layer("decoder_embeddings_layer_norm")(x)
-        x = self.backbone.get_layer("decoder_embeddings_dropout")(x)
+        x = self.backbone.decoder_embeddings_add((tokens, positions))
+        x = self.backbone.decoder_embeddings_layer_norm(x)
+        x = self.backbone.decoder_embeddings_dropout(x)

         # Every decoder layer has a separate cache for the self-attention layer
         # and the cross-attention layer. We update all of them separately.
         self_attention_caches = []
         cross_attention_caches = []
-        for i in range(self.backbone.num_layers):
+        for i, layer in enumerate(self.backbone.decoder_transformer_layers):
             current_self_attention_cache = self_attention_cache[:, i, ...]
             current_cross_attention_cache = cross_attention_cache[:, i, ...]
-
             (
                 x,
                 next_self_attention_cache,
                 next_cross_attention_cache,
-            ) = self.backbone.get_layer(f"transformer_decoder_layer_{i}")(
+            ) = layer(
                 decoder_sequence=x,
                 encoder_sequence=encoder_hidden_states,
                 encoder_padding_mask=encoder_padding_mask,
@@ -315,7 +307,6 @@ def call_decoder_with_cache(
                 cross_attention_cache=current_cross_attention_cache,
                 cross_attention_cache_update_index=cross_attention_cache_update_index,
             )
-
             if self_attention_cache_update_index is not None:
                 self_attention_caches.append(next_self_attention_cache)
             if cross_attention_cache_update_index is not None:
@@ -337,26 +328,13 @@ def call_decoder_with_cache(

     def call_encoder(self, token_ids, padding_mask):
         """Does a forward pass on the encoder and returns the encoder output."""
-
-        # Embedding layers.
-        token_embedding = self.backbone.get_layer("token_embedding")(token_ids)
-        position_embedding = self.backbone.get_layer(
-            "encoder_position_embedding"
-        )(token_embedding)
-
-        # Sum, normalize and apply dropout to embeddings.
-        x = self.backbone.get_layer("encoder_embeddings_add")(
-            (token_embedding, position_embedding)
-        )
-        x = self.backbone.get_layer("encoder_embeddings_layer_norm")(x)
-        x = self.backbone.get_layer("encoder_embeddings_dropout")(x)
-
-        # Transformer encoder layers.
-        for i in range(self.backbone.num_layers):
-            x = self.backbone.get_layer(f"transformer_encoder_layer_{i}")(
-                x, padding_mask=padding_mask
-            )
-
+        tokens = self.backbone.token_embedding(token_ids)
+        positions = self.backbone.encoder_position_embedding(tokens)
+        x = self.backbone.encoder_embeddings_add((tokens, positions))
+        x = self.backbone.encoder_embeddings_layer_norm(x)
+        x = self.backbone.encoder_embeddings_dropout(x)
+        for transformer_layer in self.backbone.encoder_transformer_layers:
+            x = transformer_layer(x, padding_mask=padding_mask)
         return x

     def _initialize_cache(self, encoder_token_ids, decoder_token_ids):
diff --git a/keras_nlp/models/bert/bert_backbone.py b/keras_nlp/models/bert/bert_backbone.py
index 174b0f0e42..f511de3687 100644
--- a/keras_nlp/models/bert/bert_backbone.py
+++ b/keras_nlp/models/bert/bert_backbone.py
@@ -99,57 +99,40 @@ def __init__(
         num_segments=2,
         **kwargs,
     ):
-        # Index of classification token in the vocabulary
-        cls_token_index = 0
-        # Inputs
-        token_id_input = keras.Input(
-            shape=(None,), dtype="int32", name="token_ids"
-        )
-        segment_id_input = keras.Input(
-            shape=(None,), dtype="int32", name="segment_ids"
-        )
-        padding_mask = keras.Input(
-            shape=(None,), dtype="int32", name="padding_mask"
-        )
-
-        # Embed tokens, positions, and segment ids.
- token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, embeddings_initializer=bert_kernel_initializer(), name="token_embedding", ) - token_embedding = token_embedding_layer(token_id_input) - position_embedding = PositionEmbedding( + self.position_embedding = PositionEmbedding( initializer=bert_kernel_initializer(), sequence_length=max_sequence_length, name="position_embedding", - )(token_embedding) - segment_embedding = keras.layers.Embedding( + ) + self.segment_embedding = keras.layers.Embedding( input_dim=num_segments, output_dim=hidden_dim, embeddings_initializer=bert_kernel_initializer(), name="segment_embedding", - )(segment_id_input) - - # Sum, normalize and apply dropout to embeddings. - x = keras.layers.Add()( - (token_embedding, position_embedding, segment_embedding) ) - x = keras.layers.LayerNormalization( + self.embeddings_add = keras.layers.Add( + name="embeddings_add", + ) + self.embeddings_layer_norm = keras.layers.LayerNormalization( name="embeddings_layer_norm", axis=-1, epsilon=1e-12, dtype="float32", - )(x) - x = keras.layers.Dropout( + ) + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(x) - - # Apply successive transformer encoder blocks. + ) + self.transformer_layers = [] for i in range(num_layers): - x = TransformerEncoder( + layer = TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, activation=gelu_approximate, @@ -157,24 +140,45 @@ def __init__( layer_norm_epsilon=1e-12, kernel_initializer=bert_kernel_initializer(), name=f"transformer_layer_{i}", - )(x, padding_mask=padding_mask) - - # Construct the two BERT outputs. The pooled output is a dense layer on - # top of the [CLS] token. - sequence_output = x - pooled_output = keras.layers.Dense( + ) + self.transformer_layers.append(layer) + self.pooled_dense = keras.layers.Dense( hidden_dim, kernel_initializer=bert_kernel_initializer(), activation="tanh", name="pooled_dense", - )(x[:, cls_token_index, :]) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + segment_id_input = keras.Input( + shape=(None,), dtype="int32", name="segment_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + # Embed tokens, positions, and segment ids. + tokens = self.token_embedding(token_id_input) + positions = self.position_embedding(tokens) + segments = self.segment_embedding(segment_id_input) + # Sum, normalize and apply dropout to embeddings. + x = self.embeddings_add((tokens, positions, segments)) + x = self.embeddings_layer_norm(x) + x = self.embeddings_dropout(x) + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, padding_mask=padding_mask_input) + # Construct the two BERT outputs. The pooled output is a dense layer on + # top of the [CLS] token. 
+ sequence_output = x + cls_token_index = 0 + pooled_output = self.pooled_dense(x[:, cls_token_index, :]) super().__init__( inputs={ "token_ids": token_id_input, "segment_ids": segment_id_input, - "padding_mask": padding_mask, + "padding_mask": padding_mask_input, }, outputs={ "sequence_output": sequence_output, @@ -183,7 +187,7 @@ def __init__( **kwargs, ) - # All references to `self` below this line + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -193,7 +197,6 @@ def __init__( self.max_sequence_length = max_sequence_length self.num_segments = num_segments self.cls_token_index = cls_token_index - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/bert/bert_classifier.py b/keras_nlp/models/bert/bert_classifier.py index 2a9aa548bf..3ddb90a9d2 100644 --- a/keras_nlp/models/bert/bert_classifier.py +++ b/keras_nlp/models/bert/bert_classifier.py @@ -140,30 +140,37 @@ def __init__( dropout=0.1, **kwargs, ): - inputs = backbone.input - pooled = backbone(inputs)["pooled_output"] - pooled = keras.layers.Dropout(dropout)(pooled) - outputs = keras.layers.Dense( + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.output_dropout = keras.layers.Dropout( + dropout, + name="classifier_dropout", + ) + self.output_dense = keras.layers.Dense( num_classes, kernel_initializer=bert_kernel_initializer(), activation=activation, name="logits", - )(pooled) - # Instantiate using Functional API Model constructor + ) + + # === Functional Model === + inputs = backbone.input + pooled = backbone(inputs)["pooled_output"] + pooled = self.output_dropout(pooled) + outputs = self.output_dense(pooled) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + + # === Config === self.num_classes = num_classes self.activation = keras.activations.get(activation) self.dropout = dropout - # Default compilation + # === Default compilation === logit_output = self.activation == keras.activations.linear self.compile( loss=keras.losses.SparseCategoricalCrossentropy( diff --git a/keras_nlp/models/bert/bert_masked_lm.py b/keras_nlp/models/bert/bert_masked_lm.py index d4c12d1091..555c562f1f 100644 --- a/keras_nlp/models/bert/bert_masked_lm.py +++ b/keras_nlp/models/bert/bert_masked_lm.py @@ -101,6 +101,18 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.masked_lm_head = MaskedLMHead( + vocabulary_size=backbone.vocabulary_size, + token_embedding=backbone.token_embedding, + intermediate_activation="gelu", + kernel_initializer=bert_kernel_initializer(), + name="mlm_head", + ) + + # === Functional Model === inputs = { **backbone.input, "mask_positions": keras.Input( @@ -108,22 +120,16 @@ def __init__( ), } backbone_outputs = backbone(backbone.input) - outputs = MaskedLMHead( - vocabulary_size=backbone.vocabulary_size, - token_embedding=backbone.token_embedding, - intermediate_activation="gelu", - kernel_initializer=bert_kernel_initializer(), - name="mlm_head", - )(backbone_outputs["sequence_output"], inputs["mask_positions"]) - - # Instantiate using Functional API Model constructor + outputs = self.masked_lm_head( + backbone_outputs["sequence_output"], inputs["mask_positions"] + ) super().__init__( inputs=inputs, 
outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line + + # === Default compilation === self.backbone = backbone self.preprocessor = preprocessor self.compile( diff --git a/keras_nlp/models/bloom/bloom_backbone.py b/keras_nlp/models/bloom/bloom_backbone.py index 2fd18ed760..e65bf8ecb5 100644 --- a/keras_nlp/models/bloom/bloom_backbone.py +++ b/keras_nlp/models/bloom/bloom_backbone.py @@ -95,46 +95,55 @@ def __init__( max_sequence_length=512, **kwargs, ): - token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids") - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens - token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, embeddings_initializer=_bloom_kernel_initializer(stddev=0.02), tie_weights=False, name="token_embedding", ) - token_embedding = token_embedding_layer(token_ids) - - x = keras.layers.LayerNormalization( - epsilon=layer_norm_epsilon, name="token_embedding_layernorm" - )(token_embedding) - + self.embeddings_layer_norm = keras.layers.LayerNormalization( + epsilon=layer_norm_epsilon, + name="token_embedding_layernorm", + ) + self.transformer_layers = [] for i in range(num_layers): - x = BloomDecoder( + layer = BloomDecoder( num_heads=num_heads, intermediate_dim=intermediate_dim, dropout=dropout, layer_norm_epsilon=layer_norm_epsilon, name=f"transformer_layer_{i}", - )(x, decoder_padding_mask=padding_mask) - - sequence_output = keras.layers.LayerNormalization( - epsilon=layer_norm_epsilon, name="final_layernorm" - )(x) + ) + self.transformer_layers.append(layer) + self.layer_norm = keras.layers.LayerNormalization( + epsilon=layer_norm_epsilon, + name="final_layernorm", + ) + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + x = self.token_embedding(token_id_input) + x = self.embeddings_layer_norm(x) + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, decoder_padding_mask=padding_mask_input) + sequence_output = self.layer_norm(x) super().__init__( inputs={ - "token_ids": token_ids, - "padding_mask": padding_mask, + "token_ids": token_id_input, + "padding_mask": padding_mask_input, }, outputs=sequence_output, **kwargs, ) + + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -143,7 +152,6 @@ def __init__( self.dropout = dropout self.layer_norm_epsilon = layer_norm_epsilon self.max_sequence_length = max_sequence_length - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py index aa5077ec67..87531aefcb 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py @@ -108,46 +108,32 @@ def __init__( bucket_size=256, **kwargs, ): - # Inputs - token_id_input = keras.Input( - shape=(None,), dtype="int32", name="token_ids" - ) - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens. 
- token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, embeddings_initializer=deberta_kernel_initializer(), name="token_embedding", ) - x = token_embedding_layer(token_id_input) - - # Normalize and apply dropout to embeddings. - x = keras.layers.LayerNormalization( + self.embeddings_layer_norm = keras.layers.LayerNormalization( epsilon=1e-7, dtype="float32", name="embeddings_layer_norm", - )(x) - x = keras.layers.Dropout( + ) + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(x) - - # Relative embedding layer. - rel_embeddings = RelativeEmbedding( + ) + self.relative_embeddings = RelativeEmbedding( hidden_dim=hidden_dim, bucket_size=bucket_size, layer_norm_epsilon=1e-7, kernel_initializer=deberta_kernel_initializer(), name="rel_embedding", - )(x) - - # Apply successive DeBERTa encoder blocks. + ) + self.transformer_layers = [] for i in range(num_layers): - x = DisentangledAttentionEncoder( + layer = DisentangledAttentionEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, max_position_embeddings=max_sequence_length, @@ -157,22 +143,36 @@ def __init__( layer_norm_epsilon=1e-7, kernel_initializer=deberta_kernel_initializer(), name=f"disentangled_attention_encoder_layer_{i}", - )( + ) + self.transformer_layers.append(layer) + + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + x = self.token_embedding(token_id_input) + x = self.embeddings_layer_norm(x) + x = self.embeddings_dropout(x) + rel_embeddings = self.relative_embeddings(x) + for transformer_layer in self.transformer_layers: + x = transformer_layer( x, rel_embeddings=rel_embeddings, - padding_mask=padding_mask, + padding_mask=padding_mask_input, ) - - # Instantiate using Functional API Model constructor super().__init__( inputs={ "token_ids": token_id_input, - "padding_mask": padding_mask, + "padding_mask": padding_mask_input, }, outputs=x, **kwargs, ) - # All references to `self` below this line + + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -182,7 +182,6 @@ def __init__( self.max_sequence_length = max_sequence_length self.bucket_size = bucket_size self.start_token_index = 0 - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py index b03122064d..f5249cb34b 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py @@ -163,32 +163,44 @@ def __init__( dropout=0.0, **kwargs, ): - inputs = backbone.input + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.pooled_dropout = keras.layers.Dropout( + dropout, + name="pooled_dropout", + ) hidden_dim = hidden_dim or backbone.hidden_dim - - x = backbone(inputs)[:, backbone.start_token_index, :] - x = keras.layers.Dropout(dropout, name="pooled_dropout")(x) - x = keras.layers.Dense( + self.pooled_dense = keras.layers.Dense( hidden_dim, activation=keras.activations.gelu, name="pooled_dense", - )(x) - x = keras.layers.Dropout(backbone.dropout, name="classifier_dropout")(x) - outputs = keras.layers.Dense( + ) + self.output_dropout = keras.layers.Dropout( + 
backbone.dropout, + name="classifier_dropout", + ) + self.output_dense = keras.layers.Dense( num_classes, kernel_initializer=deberta_kernel_initializer(), activation=activation, name="logits", - )(x) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + inputs = backbone.input + x = backbone(inputs)[:, backbone.start_token_index, :] + x = self.pooled_dropout(x) + x = self.pooled_dense(x) + x = self.output_dropout(x) + outputs = self.output_dense(x) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line + + # === Config === self.backbone = backbone self.preprocessor = preprocessor self.num_classes = num_classes @@ -196,7 +208,7 @@ def __init__( self.hidden_dim = hidden_dim self.dropout = dropout - # Default compilation + # === Default compilation === logit_output = self.activation == keras.activations.linear self.compile( loss=keras.losses.SparseCategoricalCrossentropy( diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py index bf6a850a54..fadb4c0e24 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py @@ -104,32 +104,33 @@ def __init__( preprocessor=None, **kwargs, ): - inputs = { - **backbone.input, - "mask_positions": keras.Input( - shape=(None,), dtype="int32", name="mask_positions" - ), - } - backbone_outputs = backbone(backbone.input) - outputs = MaskedLMHead( + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.masked_lm_head = MaskedLMHead( vocabulary_size=backbone.vocabulary_size, token_embedding=backbone.token_embedding, intermediate_activation=keras.activations.gelu, kernel_initializer=deberta_kernel_initializer(), name="mlm_head", - )(backbone_outputs, inputs["mask_positions"]) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + inputs = { + **backbone.input, + "mask_positions": keras.Input( + shape=(None,), dtype="int32", name="mask_positions" + ), + } + x = backbone(backbone.input) + outputs = self.masked_lm_head(x, inputs["mask_positions"]) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(5e-5), diff --git a/keras_nlp/models/distil_bert/distil_bert_backbone.py b/keras_nlp/models/distil_bert/distil_bert_backbone.py index a3634215fa..f97e24f55d 100644 --- a/keras_nlp/models/distil_bert/distil_bert_backbone.py +++ b/keras_nlp/models/distil_bert/distil_bert_backbone.py @@ -100,39 +100,29 @@ def __init__( max_sequence_length=512, **kwargs, ): - # Inputs - token_id_input = keras.Input( - shape=(None,), dtype="int32", name="token_ids" - ) - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens and positions. 
- embedding_layer = TokenAndPositionEmbedding( + # === Layers === + self.embeddings = TokenAndPositionEmbedding( vocabulary_size=vocabulary_size, sequence_length=max_sequence_length, embedding_dim=hidden_dim, embeddings_initializer=distilbert_kernel_initializer(), name="token_and_position_embedding", ) - x = embedding_layer(token_id_input) - - # Normalize and apply dropout to embeddings. - x = keras.layers.LayerNormalization( + # Keep the token_embedding property for consistency across models. + self.token_embedding = self.embeddings.token_embedding + self.embeddings_layer_norm = keras.layers.LayerNormalization( axis=-1, epsilon=1e-12, dtype="float32", name="embeddings_layer_norm", - )(x) - x = keras.layers.Dropout( + ) + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(x) - - # Apply successive transformer encoder blocks. + ) + self.transformer_layers = [] for i in range(num_layers): - x = TransformerEncoder( + layer = TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, activation="gelu", @@ -140,18 +130,31 @@ def __init__( layer_norm_epsilon=1e-12, kernel_initializer=distilbert_kernel_initializer(), name=f"transformer_layer_{i}", - )(x, padding_mask=padding_mask) + ) + self.transformer_layers.append(layer) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + x = self.embeddings(token_id_input) + x = self.embeddings_layer_norm(x) + x = self.embeddings_dropout(x) + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, padding_mask=padding_mask_input) super().__init__( inputs={ "token_ids": token_id_input, - "padding_mask": padding_mask, + "padding_mask": padding_mask_input, }, outputs=x, **kwargs, ) - # All references to `self` below this line + + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -160,7 +163,6 @@ def __init__( self.dropout = dropout self.max_sequence_length = max_sequence_length self.cls_token_index = 0 - self.token_embedding = embedding_layer.token_embedding def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/distil_bert/distil_bert_classifier.py b/keras_nlp/models/distil_bert/distil_bert_classifier.py index 42de1cee83..cf0db9786a 100644 --- a/keras_nlp/models/distil_bert/distil_bert_classifier.py +++ b/keras_nlp/models/distil_bert/distil_bert_classifier.py @@ -150,39 +150,46 @@ def __init__( dropout=0.2, **kwargs, ): - inputs = backbone.input + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor hidden_dim = hidden_dim or backbone.hidden_dim - - x = backbone(inputs)[:, backbone.cls_token_index, :] - x = keras.layers.Dense( + self.pooled_dense = keras.layers.Dense( hidden_dim, activation="relu", kernel_initializer=distilbert_kernel_initializer(), name="pooled_dense", - )(x) - x = keras.layers.Dropout(dropout, name="classifier_dropout")(x) - outputs = keras.layers.Dense( + ) + self.output_dropout = keras.layers.Dropout( + dropout, + name="output_dropout", + ) + self.output_dense = keras.layers.Dense( num_classes, kernel_initializer=distilbert_kernel_initializer(), activation=activation, name="logits", - )(x) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + inputs = backbone.input + x = backbone(inputs)[:, 
backbone.cls_token_index, :] + x = self.pooled_dense(x) + x = self.output_dropout(x) + outputs = self.output_dense(x) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + + # === Config === self.num_classes = num_classes self.activation = keras.activations.get(activation) self.hidden_dim = hidden_dim self.dropout = dropout + # === Default compilation === logit_output = self.activation == keras.activations.linear self.compile( loss=keras.losses.SparseCategoricalCrossentropy( diff --git a/keras_nlp/models/distil_bert/distil_bert_masked_lm.py b/keras_nlp/models/distil_bert/distil_bert_masked_lm.py index 71cb117d5b..9be43f8aa1 100644 --- a/keras_nlp/models/distil_bert/distil_bert_masked_lm.py +++ b/keras_nlp/models/distil_bert/distil_bert_masked_lm.py @@ -104,6 +104,18 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.masked_lm_head = MaskedLMHead( + vocabulary_size=backbone.vocabulary_size, + token_embedding=backbone.token_embedding, + intermediate_activation="gelu", + kernel_initializer=distilbert_kernel_initializer(), + name="mlm_head", + ) + + # === Functional Model === inputs = { **backbone.input, "mask_positions": keras.Input( @@ -111,25 +123,16 @@ def __init__( ), } backbone_outputs = backbone(backbone.input) - outputs = MaskedLMHead( - vocabulary_size=backbone.vocabulary_size, - token_embedding=backbone.token_embedding, - intermediate_activation="gelu", - kernel_initializer=distilbert_kernel_initializer(), - name="mlm_head", - )(backbone_outputs, inputs["mask_positions"]) - - # Instantiate using Functional API Model constructor + outputs = self.masked_lm_head( + backbone_outputs, inputs["mask_positions"] + ) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(5e-5), diff --git a/keras_nlp/models/electra/electra_backbone.py b/keras_nlp/models/electra/electra_backbone.py index 66d1db8ccc..2e2a0197af 100644 --- a/keras_nlp/models/electra/electra_backbone.py +++ b/keras_nlp/models/electra/electra_backbone.py @@ -94,65 +94,44 @@ def __init__( num_segments=2, **kwargs, ): - # Index of classification token in the vocabulary - cls_token_index = 0 - # Inputs - token_id_input = keras.Input( - shape=(None,), dtype="int32", name="token_ids" - ) - segment_id_input = keras.Input( - shape=(None,), dtype="int32", name="segment_ids" - ) - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens, positions, and segment ids. 
- token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocab_size, output_dim=embedding_dim, embeddings_initializer=electra_kernel_initializer(), name="token_embedding", ) - token_embedding = token_embedding_layer(token_id_input) - position_embedding = PositionEmbedding( + self.position_embedding = PositionEmbedding( initializer=electra_kernel_initializer(), sequence_length=max_sequence_length, name="position_embedding", - )(token_embedding) - segment_embedding = keras.layers.Embedding( + ) + self.segment_embedding = keras.layers.Embedding( input_dim=num_segments, output_dim=embedding_dim, embeddings_initializer=electra_kernel_initializer(), name="segment_embedding", - )(segment_id_input) - - # Add all embeddings together. - x = keras.layers.Add()( - (token_embedding, position_embedding, segment_embedding), ) - # Layer normalization - x = keras.layers.LayerNormalization( + self.embeddings_add = keras.layers.Add() + self.embeddings_layer_norm = keras.layers.LayerNormalization( name="embeddings_layer_norm", axis=-1, epsilon=1e-12, dtype="float32", - )(x) - # Dropout - x = keras.layers.Dropout( + ) + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(x) + ) if hidden_dim != embedding_dim: - x = keras.layers.Dense( + self.embeddings_projection = keras.layers.Dense( hidden_dim, kernel_initializer=electra_kernel_initializer(), name="embeddings_projection", - )(x) - - # Apply successive transformer encoder blocks. + ) + self.transformer_layers = [] for i in range(num_layers): - x = TransformerEncoder( + layer = TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, activation=gelu_approximate, @@ -160,24 +139,49 @@ def __init__( layer_norm_epsilon=1e-12, kernel_initializer=electra_kernel_initializer(), name=f"transformer_layer_{i}", - )(x, padding_mask=padding_mask) - - sequence_output = x - # Construct the two ELECTRA outputs. The pooled output is a dense layer on - # top of the [CLS] token. - pooled_output = keras.layers.Dense( + ) + self.transformer_layers.append(layer) + self.pooled_dense = keras.layers.Dense( hidden_dim, kernel_initializer=electra_kernel_initializer(), activation="tanh", name="pooled_dense", - )(x[:, cls_token_index, :]) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + segment_id_input = keras.Input( + shape=(None,), dtype="int32", name="segment_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + # Embed tokens, positions, and segment ids. + tokens = self.token_embedding(token_id_input) + positions = self.position_embedding(tokens) + segments = self.segment_embedding(segment_id_input) + # Add all embeddings together. + x = self.embeddings_add((tokens, positions, segments)) + x = self.embeddings_layer_norm(x) + x = self.embeddings_dropout(x) + if hidden_dim != embedding_dim: + x = self.embeddings_projection(x) + # Apply successive transformer encoder blocks. + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, padding_mask=padding_mask_input) + # Index of classification token in the vocabulary + cls_token_index = 0 + sequence_output = x + # Construct the two ELECTRA outputs. The pooled output is a dense layer on + # top of the [CLS] token. 
+ pooled_output = self.pooled_dense(x[:, cls_token_index, :]) super().__init__( inputs={ "token_ids": token_id_input, "segment_ids": segment_id_input, - "padding_mask": padding_mask, + "padding_mask": padding_mask_input, }, outputs={ "sequence_output": sequence_output, @@ -186,7 +190,7 @@ def __init__( **kwargs, ) - # All references to self below this line + # === Config === self.vocab_size = vocab_size self.num_layers = num_layers self.num_heads = num_heads @@ -197,7 +201,6 @@ def __init__( self.max_sequence_length = max_sequence_length self.num_segments = num_segments self.cls_token_index = cls_token_index - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/f_net/f_net_backbone.py b/keras_nlp/models/f_net/f_net_backbone.py index ac4d290b02..9103a10d48 100644 --- a/keras_nlp/models/f_net/f_net_backbone.py +++ b/keras_nlp/models/f_net/f_net_backbone.py @@ -101,61 +101,44 @@ def __init__( num_segments=4, **kwargs, ): - # Index of classification token in the vocabulary - cls_token_index = 0 - # Inputs - token_id_input = keras.Input( - shape=(None,), dtype="int32", name="token_ids" - ) - segment_id_input = keras.Input( - shape=(None,), dtype="int32", name="segment_ids" - ) - - # Embed tokens, positions, and segment ids. - token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, embeddings_initializer=f_net_kernel_initializer(), name="token_embedding", ) - token_embedding = token_embedding_layer(token_id_input) - position_embedding = PositionEmbedding( + self.position_embedding = PositionEmbedding( initializer=f_net_kernel_initializer(), sequence_length=max_sequence_length, name="position_embedding", - )(token_embedding) - segment_embedding = keras.layers.Embedding( + ) + self.segment_embedding = keras.layers.Embedding( input_dim=num_segments, output_dim=hidden_dim, embeddings_initializer=f_net_kernel_initializer(), name="segment_embedding", - )(segment_id_input) - - # Sum, normalize and apply dropout to embeddings. - x = keras.layers.Add()( - (token_embedding, position_embedding, segment_embedding) ) - x = keras.layers.LayerNormalization( + self.embeddings_add = keras.layers.Add() + self.embeddings_layer_norm = keras.layers.LayerNormalization( name="embeddings_layer_norm", axis=-1, epsilon=1e-12, dtype="float32", - )(x) - - x = keras.layers.Dense( + ) + self.embedding_projection = keras.layers.Dense( hidden_dim, kernel_initializer=f_net_kernel_initializer(), bias_initializer=f_net_bias_initializer(), name="embedding_projection", - )(x) - x = keras.layers.Dropout( + ) + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(x) - - # Apply successive FNet encoder blocks. + ) + self.transformer_layers = [] for i in range(num_layers): - x = FNetEncoder( + layer = FNetEncoder( intermediate_dim=intermediate_dim, activation=gelu_approximate, dropout=dropout, @@ -163,19 +146,41 @@ def __init__( kernel_initializer=f_net_kernel_initializer(), bias_initializer=f_net_bias_initializer(), name=f"f_net_layer_{i}", - )(x) - - # Construct the two FNet outputs. The pooled output is a dense layer on - # top of the [CLS] token. 
- sequence_output = x - pooled_output = keras.layers.Dense( + ) + self.transformer_layers.append(layer) + self.pooled_dense = keras.layers.Dense( hidden_dim, kernel_initializer=f_net_kernel_initializer(), bias_initializer=f_net_bias_initializer(), activation="tanh", name="pooled_dense", - )(x[:, cls_token_index, :]) + ) + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + segment_id_input = keras.Input( + shape=(None,), dtype="int32", name="segment_ids" + ) + # Embed tokens, positions, and segment ids. + tokens = self.token_embedding(token_id_input) + positions = self.position_embedding(tokens) + segments = self.segment_embedding(segment_id_input) + # Sum, normalize and apply dropout to embeddings. + x = self.embeddings_add((tokens, positions, segments)) + x = self.embeddings_layer_norm(x) + x = self.embedding_projection(x) + x = self.embeddings_dropout(x) + # Apply successive FNet encoder blocks. + for transformer_layer in self.transformer_layers: + x = transformer_layer(x) + # Index of classification token in the vocabulary + cls_token_index = 0 + # Construct the two FNet outputs. The pooled output is a dense layer on + # top of the [CLS] token. + sequence_output = x + pooled_output = self.pooled_dense(x[:, cls_token_index, :]) # Instantiate using Functional API Model constructor super().__init__( inputs={ @@ -189,7 +194,7 @@ def __init__( **kwargs, ) - # All references to `self` below this line + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.hidden_dim = hidden_dim @@ -198,7 +203,6 @@ def __init__( self.max_sequence_length = max_sequence_length self.num_segments = num_segments self.cls_token_index = cls_token_index - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/f_net/f_net_classifier.py b/keras_nlp/models/f_net/f_net_classifier.py index f6485485e1..f4ee31d1e8 100644 --- a/keras_nlp/models/f_net/f_net_classifier.py +++ b/keras_nlp/models/f_net/f_net_classifier.py @@ -109,29 +109,37 @@ def __init__( dropout=0.1, **kwargs, ): - inputs = backbone.input - pooled = backbone(inputs)["pooled_output"] - pooled = keras.layers.Dropout(dropout)(pooled) - outputs = keras.layers.Dense( + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.output_dropout = keras.layers.Dropout( + dropout, + name="output_dropout", + ) + self.output_dense = keras.layers.Dense( num_classes, kernel_initializer=f_net_kernel_initializer(), activation=activation, name="logits", - )(pooled) - # Instantiate using Functional API Model constructor + ) + + # === Functional Model === + inputs = backbone.input + pooled = backbone(inputs)["pooled_output"] + pooled = self.output_dropout(pooled) + outputs = self.output_dense(pooled) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + + # === Config === self.num_classes = num_classes self.activation = keras.activations.get(activation) self.dropout = dropout + # === Default compilation === logit_output = self.activation == keras.activations.linear self.compile( loss=keras.losses.SparseCategoricalCrossentropy( diff --git a/keras_nlp/models/f_net/f_net_masked_lm.py b/keras_nlp/models/f_net/f_net_masked_lm.py index d7048cd525..c0eb231d78 100644 --- a/keras_nlp/models/f_net/f_net_masked_lm.py +++ 
b/keras_nlp/models/f_net/f_net_masked_lm.py @@ -101,6 +101,18 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.masked_lm_head = MaskedLMHead( + vocabulary_size=backbone.vocabulary_size, + token_embedding=backbone.token_embedding, + intermediate_activation="gelu", + kernel_initializer=f_net_kernel_initializer(), + name="mlm_head", + ) + + # === Functional Model === inputs = { **backbone.input, "mask_positions": keras.Input( @@ -108,24 +120,16 @@ def __init__( ), } backbone_outputs = backbone(backbone.input) - outputs = MaskedLMHead( - vocabulary_size=backbone.vocabulary_size, - token_embedding=backbone.token_embedding, - intermediate_activation="gelu", - kernel_initializer=f_net_kernel_initializer(), - name="mlm_head", - )(backbone_outputs["sequence_output"], inputs["mask_positions"]) - - # Instantiate using Functional API Model constructor + outputs = self.masked_lm_head( + backbone_outputs["sequence_output"], inputs["mask_positions"] + ) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(5e-5), diff --git a/keras_nlp/models/gpt2/gpt2_backbone.py b/keras_nlp/models/gpt2/gpt2_backbone.py index 89c23f71de..d3f4a41541 100644 --- a/keras_nlp/models/gpt2/gpt2_backbone.py +++ b/keras_nlp/models/gpt2/gpt2_backbone.py @@ -97,68 +97,73 @@ def __init__( max_sequence_length=1024, **kwargs, ): - # Inputs - token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids") - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens, positions. - token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, embeddings_initializer=_gpt_2_kernel_initializer(stddev=0.01), name="token_embedding", ) - token_embedding = token_embedding_layer(token_ids) - - # Can't use `TokenAndPositionEmbedding` layer here because of different - # initializers. - position_embedding = PositionEmbedding( + self.position_embedding = PositionEmbedding( initializer=_gpt_2_kernel_initializer(stddev=0.02), sequence_length=max_sequence_length, name="position_embedding", - )(token_embedding) - - # Sum and apply dropout to embeddings. - x = keras.layers.Add(name="embeddings_add")( - (token_embedding, position_embedding) ) - x = keras.layers.Dropout( + self.embeddings_add = keras.layers.Add( + name="embeddings_add", + ) + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(x) - - # Apply successive transformer decoder blocks. 
+ ) + self.transformer_layers = [] for i in range(num_layers): - x = TransformerDecoder( - intermediate_dim=intermediate_dim, - num_heads=num_heads, - dropout=dropout, - layer_norm_epsilon=1e-05, - activation=gelu_approximate, - kernel_initializer=_gpt_2_kernel_initializer(stddev=0.02), - normalize_first=True, - name=f"transformer_layer_{i}", - )(x, decoder_padding_mask=padding_mask) - - sequence_output = keras.layers.LayerNormalization( + self.transformer_layers.append( + TransformerDecoder( + intermediate_dim=intermediate_dim, + num_heads=num_heads, + dropout=dropout, + layer_norm_epsilon=1e-05, + activation=gelu_approximate, + kernel_initializer=_gpt_2_kernel_initializer(stddev=0.02), + normalize_first=True, + name=f"transformer_layer_{i}", + ) + ) + self.layer_norm = keras.layers.LayerNormalization( name="layer_norm", axis=-1, epsilon=1e-05, dtype="float32", - )(x) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + # Embed inputs. + tokens = self.token_embedding(token_id_input) + positions = self.position_embedding(tokens) + x = self.embeddings_add((tokens, positions)) + x = self.embeddings_dropout(x) + # Apply transformer layers. + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, decoder_padding_mask=padding_mask_input) + sequence_output = self.layer_norm(x) + # Instantiate using the Functional constructor. super().__init__( inputs={ - "token_ids": token_ids, - "padding_mask": padding_mask, + "token_ids": token_id_input, + "padding_mask": padding_mask_input, }, outputs=sequence_output, **kwargs, ) - # All references to `self` below this line + + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -166,7 +171,6 @@ def __init__( self.intermediate_dim = intermediate_dim self.dropout = dropout self.max_sequence_length = max_sequence_length - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm.py b/keras_nlp/models/gpt2/gpt2_causal_lm.py index 44eebd0a20..e154c88bb1 100644 --- a/keras_nlp/models/gpt2/gpt2_causal_lm.py +++ b/keras_nlp/models/gpt2/gpt2_causal_lm.py @@ -155,23 +155,21 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + + # === Functional Model === inputs = backbone.input hidden_states = backbone(inputs) outputs = backbone.token_embedding(hidden_states, reverse=True) - - # Instantiate using Functional API Model constructor. super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - self.backbone = backbone - self.preprocessor = preprocessor - self.generate_function = None - self._sampler = None - # Default compilation + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(2e-5), @@ -216,27 +214,25 @@ def call_with_cache( the final hidden representation of the input tokens, and `cache` is the decoding cache. 
""" - token_embedding = self.backbone.get_layer("token_embedding")(token_ids) - position_embedding = self.backbone.get_layer("position_embedding")( - token_embedding, start_index=cache_update_index - ) - x = self.backbone.get_layer("embeddings_add")( - (token_embedding, position_embedding) + tokens = self.backbone.token_embedding(token_ids) + positions = self.backbone.position_embedding( + tokens, start_index=cache_update_index ) - x = self.backbone.get_layer("embeddings_dropout")(x) + x = self.backbone.embeddings_add((tokens, positions)) + x = self.backbone.embeddings_dropout(x) # Each decoder layer has a cache; we update them separately. caches = [] - for i in range(self.backbone.num_layers): + for i, transformer_layer in enumerate(self.backbone.transformer_layers): current_cache = cache[:, i, ...] - x, next_cache = self.backbone.get_layer(f"transformer_layer_{i}")( + x, next_cache = transformer_layer( x, self_attention_cache=current_cache, self_attention_cache_update_index=cache_update_index, ) caches.append(next_cache) cache = ops.stack(caches, axis=1) - hidden_states = x = self.backbone.get_layer("layer_norm")(x) - logits = self.backbone.get_layer("token_embedding")(x, reverse=True) + hidden_states = x = self.backbone.layer_norm(x) + logits = self.backbone.token_embedding(x, reverse=True) return logits, hidden_states, cache def _build_cache(self, token_ids): diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone.py index 6804331aed..5bbc11af70 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_backbone.py @@ -77,29 +77,20 @@ def __init__( max_sequence_length=512, **kwargs, ): - # Inputs - token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids") - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens - token_embedding_layer = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, embeddings_initializer=_gpt_neo_x_kernel_initializer(stddev=0.01), name="token_embedding", ) - token_embedding = token_embedding_layer(token_ids) - - x = keras.layers.Dropout( + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(token_embedding) - - # Apply successive transformer decoder blocks. + ) + self.transformer_layers = [] for i in range(num_layers): - x = GPTNeoXDecoder( + layer = GPTNeoXDecoder( intermediate_dim=intermediate_dim, num_heads=num_heads, dropout=dropout, @@ -110,25 +101,38 @@ def __init__( activation=gelu_approximate, kernel_initializer=_gpt_neo_x_kernel_initializer(stddev=0.02), name=f"transformer_layer_{i}", - )(x, decoder_padding_mask=padding_mask) - - sequence_output = keras.layers.LayerNormalization( + ) + self.transformer_layers.append(layer) + self.layer_norm = keras.layers.LayerNormalization( name="layer_norm", axis=-1, epsilon=layer_norm_epsilon, dtype="float32", - )(x) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + # Embed tokens. 
+ x = self.token_embedding(token_id_input) + x = self.embeddings_dropout(x) + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, decoder_padding_mask=padding_mask_input) + sequence_output = self.layer_norm(x) super().__init__( inputs={ - "token_ids": token_ids, - "padding_mask": padding_mask, + "token_ids": token_id_input, + "padding_mask": padding_mask_input, }, outputs=sequence_output, **kwargs, ) - # All references to `self` below this line + + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -139,7 +143,6 @@ def __init__( self.rotary_max_wavelength = rotary_max_wavelength self.max_sequence_length = max_sequence_length self.layer_norm_epsilon = layer_norm_epsilon - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm.py b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm.py index 0f813470aa..bef32017ea 100644 --- a/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm.py +++ b/keras_nlp/models/gpt_neo_x/gpt_neo_x_causal_lm.py @@ -52,23 +52,21 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + + # === Functional Model === inputs = backbone.input hidden_states = backbone(inputs) outputs = backbone.token_embedding(hidden_states, reverse=True) - - # Instantiate using Functional API Model constructor. super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - self.backbone = backbone - self.preprocessor = preprocessor - self.generate_function = None - self._sampler = None - # Default compilation + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(2e-5), @@ -109,20 +107,20 @@ def call_with_cache( the final hidden representation of the input tokens, and `cache` is the decoding cache. """ - token_embedding = self.backbone.get_layer("token_embedding")(token_ids) - x = self.backbone.get_layer("embeddings_dropout")(token_embedding) + token_embedding = self.backbone.token_embedding(token_ids) + x = self.backbone.embeddings_dropout(token_embedding) # Each decoder layer has a cache; we update them separately. caches = [] - for i in range(self.backbone.num_layers): + for i, transformer_layer in enumerate(self.backbone.transformer_layers): current_cache = cache[:, i, ...] 
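To make the cache bookkeeping in `call_with_cache` easier to follow, here is a schematic NumPy sketch of the same pattern: slice layer `i` from axis 1 of the combined cache, let the layer return an updated slice, then re-stack. `fake_decoder_layer` and the shapes are stand-ins for illustration, not the KerasNLP API.

import numpy as np

batch, num_layers, max_length, num_heads, head_dim = 1, 2, 8, 4, 16
# One self-attention (key, value) cache per decoder layer, indexed on axis 1.
cache = np.zeros(
    (batch, num_layers, 2, max_length, num_heads, head_dim), dtype="float32"
)

def fake_decoder_layer(x, self_attention_cache, self_attention_cache_update_index):
    # Stand-in for a real decoder block: returns activations and a cache,
    # showing only the slicing/stacking bookkeeping.
    return x, self_attention_cache

x = np.zeros((batch, 1, num_heads * head_dim), dtype="float32")
caches = []
for i in range(num_layers):
    current_cache = cache[:, i, ...]
    x, next_cache = fake_decoder_layer(x, current_cache, 0)
    caches.append(next_cache)
cache = np.stack(caches, axis=1)  # back to the original combined layout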
- x, next_cache = self.backbone.get_layer(f"transformer_layer_{i}")( + x, next_cache = transformer_layer( x, self_attention_cache=current_cache, self_attention_cache_update_index=cache_update_index, ) caches.append(next_cache) cache = ops.stack(caches, axis=1) - x = self.backbone.get_layer("layer_norm")(x) + x = self.backbone.layer_norm(x) hidden_states = x logits = self.backbone.token_embedding(hidden_states, reverse=True) return logits, hidden_states, cache diff --git a/keras_nlp/models/llama/llama_backbone.py b/keras_nlp/models/llama/llama_backbone.py index 63438544cc..46cfdc37f2 100644 --- a/keras_nlp/models/llama/llama_backbone.py +++ b/keras_nlp/models/llama/llama_backbone.py @@ -75,26 +75,17 @@ def __init__( max_sequence_length=4096, **kwargs, ): - # Inputs - token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids") - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens - token_embedding = ReversibleEmbedding( + # === Layers === + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, embeddings_initializer=_llama_kernel_initializer(stddev=0.01), tie_weights=False, name="token_embedding", - )(token_ids) - - x = token_embedding - - # Apply successive transformer decoder blocks. + ) + self.transformer_layers = [] for i in range(num_layers): - x = LlamaDecoder( + layer = LlamaDecoder( intermediate_dim=intermediate_dim, num_query_heads=num_query_heads, num_key_value_heads=num_key_value_heads, @@ -105,23 +96,34 @@ def __init__( activation=ops.silu, kernel_initializer=_llama_kernel_initializer(stddev=0.02), name=f"transformer_layer_{i}", - )(x, decoder_padding_mask=padding_mask) - - sequence_output = LlamaLayerNorm( + ) + self.transformer_layers.append(layer) + self.layer_norm = LlamaLayerNorm( name="layer_norm", epsilon=layer_norm_epsilon, - )(x) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + x = self.token_embedding(token_id_input) + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, decoder_padding_mask=padding_mask_input) + sequence_output = self.layer_norm(x) super().__init__( inputs={ - "token_ids": token_ids, - "padding_mask": padding_mask, + "token_ids": token_id_input, + "padding_mask": padding_mask_input, }, outputs=sequence_output, **kwargs, ) - # All references to `self` below this line + + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_query_heads = num_query_heads @@ -150,7 +152,3 @@ def get_config(self): } ) return config - - @property - def token_embedding(self): - return self.get_layer("token_embedding") diff --git a/keras_nlp/models/mistral/mistral_backbone.py b/keras_nlp/models/mistral/mistral_backbone.py index 42cec8b218..107e5699cb 100644 --- a/keras_nlp/models/mistral/mistral_backbone.py +++ b/keras_nlp/models/mistral/mistral_backbone.py @@ -109,17 +109,9 @@ def __init__( dropout=0, **kwargs, ): - # Get the dtype + # === Layers === dtype = kwargs.pop("dtype", keras.backend.floatx()) - - # Inputs - token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids") - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed Tokens - token_embedding_layer = ReversibleEmbedding( + self.token_embedding = ReversibleEmbedding( 
input_dim=vocabulary_size, output_dim=hidden_dim, tie_weights=False, @@ -127,11 +119,9 @@ def __init__( dtype=dtype, name="token_embedding", ) - x = token_embedding_layer(token_ids) - - # Apply successive transformer decoder blocks + self.transformer_layers = [] for i in range(num_layers): - x = MistralTransformerDecoder( + layer = MistralTransformerDecoder( intermediate_dim=intermediate_dim, num_query_heads=num_query_heads, num_key_value_heads=num_key_value_heads, @@ -144,25 +134,35 @@ def __init__( dropout=dropout, dtype=dtype, name=f"transformer_layer_{i}", - )(x, decoder_padding_mask=padding_mask) - - sequence_output = MistralLayerNormalization( - name="sequence_output_layernorm", + ) + self.transformer_layers.append(layer) + self.layer_norm = MistralLayerNormalization( epsilon=layer_norm_epsilon, dtype=dtype, - )(x) + name="sequence_output_layernorm", + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + x = self.token_embedding(token_id_input) + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, decoder_padding_mask=padding_mask_input) + sequence_output = self.layer_norm(x) super().__init__( inputs={ - "token_ids": token_ids, - "padding_mask": padding_mask, + "token_ids": token_id_input, + "padding_mask": padding_mask_input, }, outputs=sequence_output, **kwargs, ) - # All references to `self` below this line + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_query_heads = num_query_heads @@ -174,7 +174,6 @@ def __init__( self.sliding_window = sliding_window self.layer_norm_epsilon = layer_norm_epsilon self.dropout = dropout - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/opt/opt_backbone.py b/keras_nlp/models/opt/opt_backbone.py index ff1495ba9f..d04f4b571d 100644 --- a/keras_nlp/models/opt/opt_backbone.py +++ b/keras_nlp/models/opt/opt_backbone.py @@ -93,25 +93,18 @@ def __init__( max_sequence_length=2048, **kwargs, ): - # Decoder inputs. - token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids") - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens and positions. - embedding_layer = TokenAndPositionEmbedding( + # === Layers === + self.embeddings = TokenAndPositionEmbedding( vocabulary_size=vocabulary_size, sequence_length=max_sequence_length, embedding_dim=hidden_dim, embeddings_initializer=opt_kernel_initializer(), name="embeddings", ) - x = embedding_layer(token_ids) - - # Apply successive transformer decoder blocks. + self.token_embedding = self.embeddings.token_embedding + self.transformer_layers = [] for i in range(num_layers): - x = TransformerDecoder( + layer = TransformerDecoder( intermediate_dim=intermediate_dim, num_heads=num_heads, dropout=dropout, @@ -120,27 +113,36 @@ def __init__( normalize_first=True, kernel_initializer=opt_kernel_initializer(), name=f"transformer_layer_{i}", - )(x, decoder_padding_mask=padding_mask) - - # Add a final layer norm. 
- x = keras.layers.LayerNormalization( - name="layer_norm", + ) + self.transformer_layers.append(layer) + self.layer_norm = keras.layers.LayerNormalization( axis=-1, epsilon=1e-5, dtype="float32", - )(x) + name="layer_norm", + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + x = self.embeddings(token_id_input) + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, decoder_padding_mask=padding_mask_input) + x = self.layer_norm(x) super().__init__( inputs={ - "token_ids": token_ids, - "padding_mask": padding_mask, + "token_ids": token_id_input, + "padding_mask": padding_mask_input, }, outputs=x, **kwargs, ) - # All references to `self` below this line + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -148,7 +150,6 @@ def __init__( self.intermediate_dim = intermediate_dim self.dropout = dropout self.max_sequence_length = max_sequence_length - self.token_embedding = embedding_layer.token_embedding def get_config(self): return { diff --git a/keras_nlp/models/opt/opt_causal_lm.py b/keras_nlp/models/opt/opt_causal_lm.py index 6197a87ffd..9715bc6b75 100644 --- a/keras_nlp/models/opt/opt_causal_lm.py +++ b/keras_nlp/models/opt/opt_causal_lm.py @@ -155,23 +155,21 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + + # === Functional Model === inputs = backbone.input hidden_states = backbone(inputs) outputs = backbone.token_embedding(hidden_states, reverse=True) - - # Instantiate using Functional API Model constructor. super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - self.backbone = backbone - self.preprocessor = preprocessor - self.generate_function = None - self._sampler = None - # Default compilation + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(2e-5), @@ -216,21 +214,19 @@ def call_with_cache( the final hidden representation of the input tokens, and `cache` is the decoding cache. """ - x = self.backbone.get_layer("embeddings")( - token_ids, start_index=cache_update_index - ) + x = self.backbone.embeddings(token_ids, start_index=cache_update_index) # Each decoder layer has a cache; we update them separately. caches = [] - for i in range(self.backbone.num_layers): + for i, transformer_layer in enumerate(self.backbone.transformer_layers): current_cache = cache[:, i, ...] 
- x, next_cache = self.backbone.get_layer(f"transformer_layer_{i}")( + x, next_cache = transformer_layer( x, self_attention_cache=current_cache, self_attention_cache_update_index=cache_update_index, ) caches.append(next_cache) cache = ops.stack(caches, axis=1) - x = self.backbone.get_layer("layer_norm")(x) + x = self.backbone.layer_norm(x) hidden_states = x logits = self.backbone.token_embedding(hidden_states, reverse=True) return logits, hidden_states, cache diff --git a/keras_nlp/models/roberta/roberta_backbone.py b/keras_nlp/models/roberta/roberta_backbone.py index 8495b5cb69..614104d8d7 100644 --- a/keras_nlp/models/roberta/roberta_backbone.py +++ b/keras_nlp/models/roberta/roberta_backbone.py @@ -98,39 +98,28 @@ def __init__( max_sequence_length=512, **kwargs, ): - # Inputs - token_id_input = keras.Input( - shape=(None,), dtype="int32", name="token_ids" - ) - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - - # Embed tokens and positions. - embedding_layer = TokenAndPositionEmbedding( + # === Layers === + self.embeddings = TokenAndPositionEmbedding( vocabulary_size=vocabulary_size, sequence_length=max_sequence_length, embedding_dim=hidden_dim, embeddings_initializer=roberta_kernel_initializer(), name="embeddings", ) - embedding = embedding_layer(token_id_input) - - # Sum, normalize and apply dropout to embeddings. - x = keras.layers.LayerNormalization( - name="embeddings_layer_norm", + self.token_embedding = self.embeddings.token_embedding + self.embeddings_layer_norm = keras.layers.LayerNormalization( axis=-1, epsilon=1e-5, # Original paper uses this epsilon value dtype="float32", - )(embedding) - x = keras.layers.Dropout( + name="embeddings_layer_norm", + ) + self.embeddings_dropout = keras.layers.Dropout( dropout, name="embeddings_dropout", - )(x) - - # Apply successive transformer encoder blocks. 
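Both causal LM refactors above end by calling the backbone's `token_embedding` with `reverse=True` to turn hidden states back into vocabulary logits, and the encoder backbones (RoBERTa here) now expose that same layer as a plain `token_embedding` attribute. A small standalone sketch of the tied call, assuming the public `keras_nlp.layers.ReversibleEmbedding` export (sizes are illustrative):

import numpy as np
from keras_nlp.layers import ReversibleEmbedding

embedding = ReversibleEmbedding(input_dim=100, output_dim=16)
token_ids = np.array([[3, 7, 7, 0]], dtype="int32")
hidden_states = embedding(token_ids)             # (1, 4, 16) embeddings
logits = embedding(hidden_states, reverse=True)  # (1, 4, 100) vocab logits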
+ ) + self.transformer_layers = [] for i in range(num_layers): - x = TransformerEncoder( + layer = TransformerEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, activation="gelu", @@ -138,18 +127,31 @@ def __init__( layer_norm_epsilon=1e-5, kernel_initializer=roberta_kernel_initializer(), name=f"transformer_layer_{i}", - )(x, padding_mask=padding_mask) + ) + self.transformer_layers.append(layer) - # Instantiate using Functional API Model constructor + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + x = self.embeddings(token_id_input) + x = self.embeddings_layer_norm(x) + x = self.embeddings_dropout(x) + for transformer_layer in self.transformer_layers: + x = transformer_layer(x, padding_mask=padding_mask_input) super().__init__( inputs={ "token_ids": token_id_input, - "padding_mask": padding_mask, + "padding_mask": padding_mask_input, }, outputs=x, **kwargs, ) - # All references to `self` below this line + + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -158,7 +160,6 @@ def __init__( self.dropout = dropout self.max_sequence_length = max_sequence_length self.start_token_index = 0 - self.token_embedding = embedding_layer.token_embedding def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/roberta/roberta_classifier.py b/keras_nlp/models/roberta/roberta_classifier.py index 9098d95429..e3d7666f5b 100644 --- a/keras_nlp/models/roberta/roberta_classifier.py +++ b/keras_nlp/models/roberta/roberta_classifier.py @@ -144,38 +144,50 @@ def __init__( dropout=0.0, **kwargs, ): - inputs = backbone.input + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.pooled_dropout = keras.layers.Dropout( + dropout, + name="pooled_dropout", + ) hidden_dim = hidden_dim or backbone.hidden_dim - - x = backbone(inputs)[:, backbone.start_token_index, :] - x = keras.layers.Dropout(dropout, name="pooled_dropout")(x) - x = keras.layers.Dense( - hidden_dim, activation="tanh", name="pooled_dense" - )(x) - x = keras.layers.Dropout(dropout, name="classifier_dropout")(x) - outputs = keras.layers.Dense( + self.pooled_dense = keras.layers.Dense( + hidden_dim, + activation="tanh", + name="pooled_dense", + ) + self.output_dropout = keras.layers.Dropout( + dropout, + name="output_dropout", + ) + self.output_dense = keras.layers.Dense( num_classes, kernel_initializer=roberta_kernel_initializer(), activation=activation, name="logits", - )(x) + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + inputs = backbone.input + x = backbone(inputs)[:, backbone.start_token_index, :] + x = self.pooled_dropout(x) + x = self.pooled_dense(x) + x = self.output_dropout(x) + outputs = self.output_dense(x) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + + # === Config === self.num_classes = num_classes self.activation = keras.activations.get(activation) self.hidden_dim = hidden_dim self.dropout = dropout - # Default compilation + # === Default compilation === logit_output = self.activation == keras.activations.linear self.compile( loss=keras.losses.SparseCategoricalCrossentropy( diff --git a/keras_nlp/models/roberta/roberta_masked_lm.py 
b/keras_nlp/models/roberta/roberta_masked_lm.py index 1517f25914..0e62f4cff6 100644 --- a/keras_nlp/models/roberta/roberta_masked_lm.py +++ b/keras_nlp/models/roberta/roberta_masked_lm.py @@ -103,6 +103,18 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.masked_lm_head = MaskedLMHead( + vocabulary_size=backbone.vocabulary_size, + token_embedding=backbone.token_embedding, + intermediate_activation="gelu", + kernel_initializer=roberta_kernel_initializer(), + name="mlm_head", + ) + + # === Functional Model === inputs = { **backbone.input, "mask_positions": keras.Input( @@ -110,25 +122,16 @@ def __init__( ), } backbone_outputs = backbone(backbone.input) - outputs = MaskedLMHead( - vocabulary_size=backbone.vocabulary_size, - token_embedding=backbone.token_embedding, - intermediate_activation="gelu", - kernel_initializer=roberta_kernel_initializer(), - name="mlm_head", - )(backbone_outputs, inputs["mask_positions"]) - - # Instantiate using Functional API Model constructor + outputs = self.masked_lm_head( + backbone_outputs, inputs["mask_positions"] + ) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(5e-5), diff --git a/keras_nlp/models/t5/t5_backbone.py b/keras_nlp/models/t5/t5_backbone.py index 6e76094d71..3fdf69ff30 100644 --- a/keras_nlp/models/t5/t5_backbone.py +++ b/keras_nlp/models/t5/t5_backbone.py @@ -85,45 +85,21 @@ def __init__( tie_embedding_weights=True, **kwargs, ): - # Encoder inputs - encoder_token_ids = keras.Input( - shape=(None,), dtype="int32", name="encoder_token_ids" - ) - encoder_padding_mask = keras.Input( - shape=(None,), dtype="int32", name="encoder_padding_mask" - ) - - # Decoder inputs. - decoder_token_ids = keras.Input( - shape=(None,), dtype="int32", name="decoder_token_ids" - ) - decoder_padding_mask = keras.Input( - shape=(None,), dtype="int32", name="decoder_padding_mask" - ) - # Token embedding layer. This layer is shared by encoder and decoder. - token_embedding_layer = ReversibleEmbedding( + self.token_embedding = ReversibleEmbedding( input_dim=vocabulary_size, output_dim=hidden_dim, tie_weights=tie_embedding_weights, embeddings_initializer=keras.initializers.TruncatedNormal(1.0), name="token_embedding", ) - - # ===== Encoder ===== - - # Embed tokens. 
- token_embedding = token_embedding_layer(encoder_token_ids) - x = keras.layers.Dropout( + self.encoder_embedding_dropout = keras.layers.Dropout( dropout, name="encoder_embedding_dropout", - )(token_embedding) - - encoder_attention_mask = encoder_padding_mask[:, None, :] - - position_bias = None + ) + self.encoder_transformer_layers = [] for i in range(num_layers): - output = T5TransformerLayer( + layer = T5TransformerLayer( is_decoder=False, hidden_dim=hidden_dim, intermediate_dim=intermediate_dim, @@ -135,39 +111,23 @@ def __init__( use_gated_activation=use_gated_activation, use_relative_attention_bias=bool(i == 0), name=f"transformer_encoder_layer_{i}", - )( - x, - attention_mask=encoder_attention_mask, - position_bias=position_bias, - use_causal_mask=False, ) - if isinstance(output, tuple): - x, position_bias = output - - x = T5LayerNorm( + self.encoder_transformer_layers.append(layer) + self.encoder_layer_norm = T5LayerNorm( epsilon=layer_norm_epsilon, name="encoder_output_layer_norm", - )(x) - x = keras.layers.Dropout( + ) + self.encoder_dropout = keras.layers.Dropout( dropout, name="encoder_output_dropout", - )(x) - encoder_output = x - - # ===== Decoder ===== - - # Embed tokens. - token_embedding = token_embedding_layer(decoder_token_ids) - x = keras.layers.Dropout( + ) + self.decoder_embedding_dropout = keras.layers.Dropout( dropout, name="decoder_embedding_dropout", - )(token_embedding) - - decoder_attention_mask = decoder_padding_mask[:, None, :] - - position_bias = None + ) + self.decoder_transformer_layers = [] for i in range(num_layers): - output = T5TransformerLayer( + layer = T5TransformerLayer( is_decoder=True, hidden_dim=hidden_dim, intermediate_dim=intermediate_dim, @@ -179,7 +139,54 @@ def __init__( use_gated_activation=use_gated_activation, use_relative_attention_bias=bool(i == 0), name=f"transformer_decoder_layer_{i}", - )( + ) + self.decoder_transformer_layers.append(layer) + self.decoder_layer_norm = T5LayerNorm( + epsilon=layer_norm_epsilon, + name="decoder_output_layer_norm", + ) + self.decoder_dropout = keras.layers.Dropout( + dropout, + name="decoder_output_dropout", + ) + + # === Functional Model === + encoder_token_id_input = keras.Input( + shape=(None,), dtype="int32", name="encoder_token_ids" + ) + encoder_padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="encoder_padding_mask" + ) + decoder_token_id_input = keras.Input( + shape=(None,), dtype="int32", name="decoder_token_ids" + ) + decoder_padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="decoder_padding_mask" + ) + # Encoder. + x = self.token_embedding(encoder_token_id_input) + x = self.encoder_embedding_dropout(x) + encoder_attention_mask = encoder_padding_mask_input[:, None, :] + position_bias = None + for transformer_layer in self.encoder_transformer_layers: + output = transformer_layer( + x, + attention_mask=encoder_attention_mask, + position_bias=position_bias, + use_causal_mask=False, + ) + if isinstance(output, tuple): + x, position_bias = output + x = self.encoder_layer_norm(x) + x = self.encoder_dropout(x) + encoder_output = x + # Decoder. 
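A detail worth calling out in the encoder/decoder loops above: only the first `T5TransformerLayer` is built with `use_relative_attention_bias=True`; the loop captures the position bias it produces and threads it into every later layer. A toy stand-in for just that control flow (not the real layer):

def toy_t5_layer(x, position_bias, computes_bias):
    # Stand-in for T5TransformerLayer: the first layer creates the shared
    # relative attention bias, later layers receive and return it unchanged.
    if computes_bias and position_bias is None:
        position_bias = "shared_relative_attention_bias"
    return x + 1, position_bias

x, position_bias = 0, None
for i in range(4):
    output = toy_t5_layer(x, position_bias, computes_bias=(i == 0))
    if isinstance(output, tuple):
        x, position_bias = output
print(x, position_bias)  # 4 shared_relative_attention_bias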
+ x = self.token_embedding(decoder_token_id_input) + x = self.decoder_embedding_dropout(x) + decoder_attention_mask = decoder_padding_mask_input[:, None, :] + position_bias = None + for transformer_layer in self.decoder_transformer_layers: + output = transformer_layer( x, attention_mask=decoder_attention_mask, position_bias=position_bias, @@ -189,23 +196,15 @@ def __init__( ) if isinstance(output, tuple): x, position_bias = output - - x = T5LayerNorm( - epsilon=layer_norm_epsilon, - name="decoder_output_layer_norm", - )(x) - x = keras.layers.Dropout( - dropout, - name="decoder_output_dropout", - )(x) + x = self.decoder_layer_norm(x) + x = self.decoder_dropout(x) decoder_output = x - super().__init__( { - "encoder_token_ids": encoder_token_ids, - "encoder_padding_mask": encoder_padding_mask, - "decoder_token_ids": decoder_token_ids, - "decoder_padding_mask": decoder_padding_mask, + "encoder_token_ids": encoder_token_id_input, + "encoder_padding_mask": encoder_padding_mask_input, + "decoder_token_ids": decoder_token_id_input, + "decoder_padding_mask": decoder_padding_mask_input, }, outputs={ "encoder_sequence_output": encoder_output, @@ -213,7 +212,8 @@ def __init__( }, **kwargs, ) - # All references to `self` below this line + + # === Config === self.vocabulary_size = vocabulary_size self.hidden_dim = hidden_dim self.intermediate_dim = intermediate_dim @@ -225,7 +225,6 @@ def __init__( self.use_gated_activation = use_gated_activation self.layer_norm_epsilon = layer_norm_epsilon self.tie_embedding_weights = tie_embedding_weights - self.token_embedding = token_embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/task.py b/keras_nlp/models/task.py index ee28e3a984..1fe8d0b789 100644 --- a/keras_nlp/models/task.py +++ b/keras_nlp/models/task.py @@ -31,18 +31,25 @@ class Task(PipelineModel): """Base class for Task models.""" def __init__(self, *args, **kwargs): - self._backbone = None - self._preprocessor = None super().__init__(*args, **kwargs) self._functional_layer_ids = set( id(layer) for layer in self._flatten_layers() ) + self._initialized = True def __dir__(self): - # Temporary fixes for weight saving. This mimics the following PR for + if config.keras_3(): + return super().__dir__() + + # Temporary fixes for Keras 2 saving. This mimics the following PR for # older version of Keras: https://github.com/keras-team/keras/pull/18982 def filter_fn(attr): - if attr == "_layer_checkpoint_dependencies": + if attr in [ + "_layer_checkpoint_dependencies", + "transformer_layers", + "encoder_transformer_layers", + "decoder_transformer_layers", + ]: return False return id(getattr(self, attr)) not in self._functional_layer_ids @@ -99,17 +106,28 @@ def compile(self, optimizer="rmsprop", loss=None, **kwargs): super().compile(optimizer=optimizer, loss=loss, **kwargs) def preprocess_samples(self, x, y=None, sample_weight=None): - return self.preprocessor(x, y=y, sample_weight=sample_weight) + if self.preprocessor is not None: + return self.preprocessor(x, y=y, sample_weight=sample_weight) + else: + return super().preprocess_samples(x, y, sample_weight) def __setattr__(self, name, value): - # Work around torch setattr for properties. - if name in ["backbone", "preprocessor"]: + # Work around setattr issues for Keras 2 and Keras 3 torch backend. + # Since all our state is covered by functional model we can route + # around custom setattr calls. 
+ is_property = isinstance(getattr(type(self), name, None), property) + is_unitialized = not hasattr(self, "_initialized") + is_torch = config.backend() == "torch" + is_keras_2 = not config.keras_3() + if is_torch and (is_property or is_unitialized): + return object.__setattr__(self, name, value) + if is_keras_2 and is_unitialized: return object.__setattr__(self, name, value) return super().__setattr__(name, value) @property def backbone(self): - """A `keras.Model` instance providing the backbone submodel.""" + """A `keras.Model` instance providing the backbone sub-model.""" return self._backbone @backbone.setter @@ -123,7 +141,6 @@ def preprocessor(self): @preprocessor.setter def preprocessor(self, value): - self.include_preprocessing = value is not None self._preprocessor = value def get_config(self): diff --git a/keras_nlp/models/task_test.py b/keras_nlp/models/task_test.py index 09fe1b0086..bf82e4fa68 100644 --- a/keras_nlp/models/task_test.py +++ b/keras_nlp/models/task_test.py @@ -32,11 +32,11 @@ def __init__(self, **kwargs): class SimpleTask(Task): def __init__(self, preprocessor=None, activation=None, **kwargs): + self.preprocessor = preprocessor + self.activation = keras.activations.get(activation) inputs = keras.Input(shape=(5,)) outputs = keras.layers.Dense(5)(inputs) super().__init__(inputs, outputs, **kwargs) - self.preprocessor = preprocessor - self.activation = keras.activations.get(activation) class TestTask(TestCase): diff --git a/keras_nlp/models/whisper/whisper_backbone.py b/keras_nlp/models/whisper/whisper_backbone.py index 32cfab215b..2e84219091 100644 --- a/keras_nlp/models/whisper/whisper_backbone.py +++ b/keras_nlp/models/whisper/whisper_backbone.py @@ -116,75 +116,40 @@ def __init__( ): assert_tf_backend(self.__class__.__name__) - # Encoder inputs. Note that the encoder does not have a padding mask: - # https://github.com/openai/whisper/blob/v20230124/whisper/model.py#L132. - encoder_feature_input = keras.Input( - shape=(None, num_mels), dtype="float32", name="encoder_features" - ) - - # Decoder inputs. - decoder_token_id_input = keras.Input( - shape=(None,), dtype="int32", name="decoder_token_ids" - ) - decoder_padding_mask = keras.Input( - shape=(None,), dtype="int32", name="decoder_padding_mask" - ) - - # ====== Encoder ====== - - # Embed the input features. This consists of two 1D convolutional - # layers. - # For the first layer, we use `padding="same"` since that corresponds to - # a padding size of 1. - encoder_conv_layer_1 = keras.layers.Conv1D( + # === Layers === + self.encoder_conv_layer_1 = keras.layers.Conv1D( filters=hidden_dim, kernel_size=3, strides=1, padding="same", name="encoder_token_embedding_conv_layer_1", ) - embedded_features = keras.activations.gelu( - encoder_conv_layer_1(encoder_feature_input), - approximate=False, - ) - - # For the second conv. layer, we cannot use `padding="same"` since - # that corresponds to a padding size of 1.5 (since stride is 2). Hence, - # we will manually pad the input. 
- embedded_features = Padder()(embedded_features) - encoder_conv_layer_2 = keras.layers.Conv1D( + self.encoder_conv_layer_2 = keras.layers.Conv1D( filters=hidden_dim, kernel_size=3, strides=2, padding="valid", name="encoder_token_embedding_conv_layer_2", ) - embedded_features = keras.activations.gelu( - encoder_conv_layer_2(embedded_features), - approximate=False, + self.encoder_padder = Padder( + name="encoder_padder", ) - - # The position embedding layer for the encoder is a sinusoidal embedding - # layer: https://github.com/openai/whisper/blob/v20230124/whisper/model.py#L137. - # Hence, we set it to be non-trainable. - # TODO: We can use `keras_nlp.layers.SinePositionEncoding` layer. - position_embedding = PositionEmbedding( + self.encoder_position_embedding = PositionEmbedding( initializer=whisper_kernel_initializer(), sequence_length=max_encoder_sequence_length // 2, name="encoder_position_embedding", trainable=False, - )(embedded_features) - - # Sum and apply dropout to embeddings. - x = keras.layers.Add()((embedded_features, position_embedding)) - x = keras.layers.Dropout( + ) + self.encoder_embeddings_add = keras.layers.Add( + name="encoder_embeddings_add", + ) + self.encoder_embeddings_dropout = keras.layers.Dropout( dropout, name="encoder_embeddings_dropout", - )(x) - - # Apply successive transformer encoder blocks. + ) + self.encoder_transformer_layers = [] for i in range(num_layers): - x = WhisperEncoder( + layer = WhisperEncoder( num_heads=num_heads, intermediate_dim=intermediate_dim, activation=keras.activations.gelu, @@ -193,37 +158,29 @@ def __init__( kernel_initializer=whisper_kernel_initializer(), normalize_first=True, name=f"transformer_encoder_layer_{i}", - )(x) - - x = keras.layers.LayerNormalization( + ) + self.encoder_transformer_layers.append(layer) + self.encoder_layer_norm = keras.layers.LayerNormalization( name="encoder_layer_norm", axis=-1, epsilon=1e-5, dtype="float32", - )(x) - encoder_output = x - - # ====== Decoder ====== - - # Embed tokens and positions. - embedding_layer = TokenAndPositionEmbedding( + ) + self.decoder_embeddings = TokenAndPositionEmbedding( vocabulary_size=vocabulary_size, sequence_length=max_decoder_sequence_length, embedding_dim=hidden_dim, embeddings_initializer=whisper_kernel_initializer(), name="decoder_token_and_position_embedding", ) - x = embedding_layer(decoder_token_id_input) - - # Apply dropout to embeddings. - x = keras.layers.Dropout( + self.token_embedding = self.decoder_embeddings.token_embedding + self.decoder_embeddings_dropout = keras.layers.Dropout( dropout, name="decoder_embeddings_dropout", - )(x) - - # Apply successive transformer decoder blocks. 
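The encoder convolution comments above deserve a quick arithmetic check: with `kernel_size=3` and `strides=2`, `padding="same"` would correspond to a fractional padding of 1.5, so the code keeps the explicit `Padder` followed by a `"valid"` convolution. A standalone sketch of the equivalent shapes, assuming the padder adds one frame on each side of the sequence axis (sizes are illustrative):

import numpy as np
import keras

features = np.zeros((1, 30, 80), dtype="float32")  # (batch, frames, num_mels)
padded = np.pad(features, [[0, 0], [1, 1], [0, 0]])  # manual pad of 1 frame
conv = keras.layers.Conv1D(filters=8, kernel_size=3, strides=2, padding="valid")
downsampled = conv(padded)
print(downsampled.shape)  # (1, 15, 8): frame count halved, as in the encoder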
+ ) + self.decoder_transformer_layers = [] for i in range(num_layers): - transformer_decoder_layer = WhisperDecoder( + layer = WhisperDecoder( intermediate_dim=intermediate_dim, num_heads=num_heads, dropout=dropout, @@ -233,26 +190,70 @@ def __init__( normalize_first=True, name=f"transformer_decoder_layer_{i}", ) - x = transformer_decoder_layer( - decoder_sequence=x, - encoder_sequence=encoder_output, - decoder_padding_mask=decoder_padding_mask, - ) - - x = keras.layers.LayerNormalization( + self.decoder_transformer_layers.append(layer) + self.decoder_layer_norm = keras.layers.LayerNormalization( name="decoder_layer_norm", axis=-1, epsilon=1e-5, dtype="float32", - )(x) - decoder_output = x + ) - # Instantiate using Functional API Model constructor + # === Functional Model === + # Note that the encoder does not have a padding mask: + # https://github.com/openai/whisper/blob/v20230124/whisper/model.py#L132. + encoder_feature_input = keras.Input( + shape=(None, num_mels), dtype="float32", name="encoder_features" + ) + decoder_token_id_input = keras.Input( + shape=(None,), dtype="int32", name="decoder_token_ids" + ) + decoder_padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="decoder_padding_mask" + ) + # Encoder. + # Embed the input features. This consists of two 1D convolutional + # layers. + # For the first layer, we use `padding="same"` since that corresponds to + # a padding size of 1. + embedded_features = keras.activations.gelu( + self.encoder_conv_layer_1(encoder_feature_input), + approximate=False, + ) + # For the second conv. layer, we cannot use `padding="same"` since + # that corresponds to a padding size of 1.5 (since stride is 2). Hence, + # we will manually pad the input. + embedded_features = Padder()(embedded_features) + embedded_features = keras.activations.gelu( + self.encoder_conv_layer_2(embedded_features), + approximate=False, + ) + # The position embedding layer for the encoder is a sinusoidal embedding + # layer: https://github.com/openai/whisper/blob/v20230124/whisper/model.py#L137. + # Hence, we set it to be non-trainable. + # TODO: We can use `keras_nlp.layers.SinePositionEncoding` layer. + positions = self.encoder_position_embedding(embedded_features) + x = self.encoder_embeddings_add((embedded_features, positions)) + x = self.encoder_embeddings_dropout(x) + for transformer_layer in self.encoder_transformer_layers: + x = transformer_layer(x) + x = self.encoder_layer_norm(x) + encoder_output = x + # Decoder. 
+ x = self.decoder_embeddings(decoder_token_id_input) + x = self.decoder_embeddings_dropout(x) + for transformer_layer in self.decoder_transformer_layers: + x = transformer_layer( + decoder_sequence=x, + encoder_sequence=encoder_output, + decoder_padding_mask=decoder_padding_mask_input, + ) + x = self.decoder_layer_norm(x) + decoder_output = x super().__init__( inputs={ "encoder_features": encoder_feature_input, "decoder_token_ids": decoder_token_id_input, - "decoder_padding_mask": decoder_padding_mask, + "decoder_padding_mask": decoder_padding_mask_input, }, outputs={ "encoder_sequence_output": encoder_output, @@ -261,7 +262,7 @@ def __init__( **kwargs, ) - # All references to `self` below this line + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads @@ -271,7 +272,6 @@ def __init__( self.dropout = dropout self.max_encoder_sequence_length = max_encoder_sequence_length self.max_decoder_sequence_length = max_decoder_sequence_length - self.token_embedding = embedding_layer def get_config(self): config = super().get_config() diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py index 67a9dd5bef..45d79eb304 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py @@ -157,30 +157,45 @@ def __init__( dropout=0.0, **kwargs, ): - inputs = backbone.input + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.pooled_dropout = keras.layers.Dropout( + dropout, + name="pooled_dropout", + ) hidden_dim = hidden_dim or backbone.hidden_dim - - x = backbone(inputs)[:, backbone.start_token_index, :] - x = keras.layers.Dropout(dropout, name="pooled_dropout")(x) - x = keras.layers.Dense( - hidden_dim, activation="tanh", name="pooled_dense" - )(x) - x = keras.layers.Dropout(dropout, name="classifier_dropout")(x) - outputs = keras.layers.Dense( + self.pooled_dense = keras.layers.Dense( + hidden_dim, + activation="tanh", + name="pooled_dense", + ) + self.output_dropout = keras.layers.Dropout( + dropout, + name="output_dropout", + ) + self.output_dense = keras.layers.Dense( num_classes, kernel_initializer=roberta_kernel_initializer(), activation=activation, name="logits", - )(x) + ) + # === Functional Model === + inputs = backbone.input + x = backbone(inputs)[:, backbone.start_token_index, :] + x = self.pooled_dropout(x) + x = self.pooled_dense(x) + x = self.output_dropout(x) + outputs = self.output_dense(x) # Instantiate using Functional API Model constructor super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line + + # === Config === self.backbone = backbone self.preprocessor = preprocessor self.num_classes = num_classes @@ -188,6 +203,7 @@ def __init__( self.hidden_dim = hidden_dim self.dropout = dropout + # === Default compilation === logit_output = self.activation == keras.activations.linear self.compile( loss=keras.losses.SparseCategoricalCrossentropy( @@ -198,9 +214,6 @@ def __init__( jit_compile=True, ) - def preprocess_samples(self, x, y=None, sample_weight=None): - return self.preprocessor(x, y=y, sample_weight=sample_weight) - def get_config(self): config = super().get_config() config.update( diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm.py index f0dfc85e84..b29aa30dd9 100644 --- 
a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm.py @@ -106,6 +106,18 @@ def __init__( preprocessor=None, **kwargs, ): + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + self.masked_lm_head = MaskedLMHead( + vocabulary_size=backbone.vocabulary_size, + token_embedding=backbone.token_embedding, + intermediate_activation="gelu", + kernel_initializer=roberta_kernel_initializer(), + name="mlm_head", + ) + + # === Functional Model === inputs = { **backbone.input, "mask_positions": keras.Input( @@ -113,25 +125,16 @@ def __init__( ), } backbone_outputs = backbone(backbone.input) - outputs = MaskedLMHead( - vocabulary_size=backbone.vocabulary_size, - token_embedding=backbone.token_embedding, - intermediate_activation="gelu", - kernel_initializer=roberta_kernel_initializer(), - name="mlm_head", - )(backbone_outputs, inputs["mask_positions"]) - - # Instantiate using Functional API Model constructor. + outputs = self.masked_lm_head( + backbone_outputs, inputs["mask_positions"] + ) super().__init__( inputs=inputs, outputs=outputs, - include_preprocessing=preprocessor is not None, **kwargs, ) - # All references to `self` below this line - self.backbone = backbone - self.preprocessor = preprocessor + # === Default compilation === self.compile( loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), optimizer=keras.optimizers.Adam(5e-5), diff --git a/keras_nlp/models/xlnet/xlnet_backbone.py b/keras_nlp/models/xlnet/xlnet_backbone.py index 1d1b4d2343..45a4b8f407 100644 --- a/keras_nlp/models/xlnet/xlnet_backbone.py +++ b/keras_nlp/models/xlnet/xlnet_backbone.py @@ -103,42 +103,25 @@ def __init__( bias_initializer="zeros", **kwargs, ): - # Inputs - token_id_input = keras.Input( - shape=(None,), dtype="int32", name="token_ids" - ) - padding_mask = keras.Input( - shape=(None,), dtype="int32", name="padding_mask" - ) - segment_ids = keras.Input( - shape=(None,), dtype="int32", name="segment_ids" - ) - - # Content and Query Embedding - word_emb, pos_emb = ContentAndQueryEmbedding( + # === Layers === + self.content_query_embedding = ContentAndQueryEmbedding( vocabulary_size=vocabulary_size, hidden_dim=hidden_dim, dropout=dropout, name="content_query_embedding", - )(token_id_input=token_id_input) - - # Apply XLNetAttentionMaskLayer and XLNetSegmentMatrixLayer Layers - # to get the processed attention masks and segment matrix. 
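As with the FNet and RoBERTa masked LMs earlier in this patch, the XLM-RoBERTa head is now created as an attribute and tied to the backbone's token embedding. A minimal standalone sketch of that tying, using a freestanding `ReversibleEmbedding` in place of a backbone (sizes illustrative; assumes the public `keras_nlp.layers.MaskedLMHead`):

import numpy as np
from keras_nlp.layers import MaskedLMHead, ReversibleEmbedding

embedding = ReversibleEmbedding(input_dim=100, output_dim=16)
head = MaskedLMHead(
    vocabulary_size=100,
    token_embedding=embedding,
    intermediate_activation="gelu",
)
sequence_output = np.random.uniform(size=(1, 10, 16)).astype("float32")
mask_positions = np.array([[2, 5]], dtype="int32")
logits = head(sequence_output, mask_positions)  # (1, 2, 100) masked-token logits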
- attn_mask_content, attn_mask_query = XLNetAttentionMaskLayer( + ) + self.attn_mask_layer = XLNetAttentionMaskLayer( hidden_dim=hidden_dim, kernel_initializer_range=kernel_initializer_range, name="encoder_block_attn_mask_layer", - )(padding_mask) - seg_mat = XLNetSegmentMatrixLayer(name="encoder_block_seg_mat_layer")( - segment_ids ) - - output_content = word_emb - - # Encoders + self.seg_mat_layer = XLNetSegmentMatrixLayer( + name="encoder_block_seg_mat_layer", + ) head_dim = hidden_dim // num_heads + self.transformer_layers = [] for i in range(num_layers): - output_content, output_query = XLNetEncoder( + layer = XLNetEncoder( num_heads=num_heads, hidden_dim=hidden_dim, head_dim=head_dim, @@ -149,27 +132,52 @@ def __init__( kernel_initializer_range=kernel_initializer_range, bias_initializer=bias_initializer, name=f"xlnet_encoder_{i}", - )( + ) + self.transformer_layers.append(layer) + self.dropout = keras.layers.Dropout( + dropout, + name="dropout", + ) + + # === Functional Model === + token_id_input = keras.Input( + shape=(None,), dtype="int32", name="token_ids" + ) + padding_mask_input = keras.Input( + shape=(None,), dtype="int32", name="padding_mask" + ) + segment_id_input = keras.Input( + shape=(None,), dtype="int32", name="segment_ids" + ) + # Content and Query Embedding + word_emb, pos_emb = self.content_query_embedding(token_id_input) + # Apply XLNetAttentionMaskLayer and XLNetSegmentMatrixLayer Layers + # to get the processed attention masks and segment matrix. + attn_mask_content, attn_mask_query = self.attn_mask_layer( + padding_mask_input + ) + seg_mat = self.seg_mat_layer(segment_id_input) + output_content = word_emb + for transformer_layer in self.transformer_layers: + output_content, output_query = transformer_layer( output_content=output_content, attn_mask_content=attn_mask_content, attn_mask_query=attn_mask_query, pos_emb=pos_emb, seg_mat=seg_mat, ) - - output = keras.layers.Dropout(dropout)(output_content) - + output = self.dropout(output_content) super().__init__( inputs={ "token_ids": token_id_input, - "padding_mask": padding_mask, - "segment_ids": segment_ids, + "padding_mask": padding_mask_input, + "segment_ids": segment_id_input, }, outputs=output, **kwargs, ) - # All references to `self` below this line + # === Config === self.vocabulary_size = vocabulary_size self.num_layers = num_layers self.num_heads = num_heads diff --git a/keras_nlp/utils/pipeline_model.py b/keras_nlp/utils/pipeline_model.py index fa08aaf929..89a2f81822 100644 --- a/keras_nlp/utils/pipeline_model.py +++ b/keras_nlp/utils/pipeline_model.py @@ -142,27 +142,15 @@ def _split(t, start, end): class PipelineModel(keras.Model): """A model which allows automatically applying preprocessing.""" - def __init__(self, *args, include_preprocessing=True, **kwargs): + def __init__(self, *args, **kwargs): # Workaround for https://github.com/keras-team/keras/issues/17270 # Reset any attempt to overwrite this classes base class to this class # can continue to be used for functional and non-functional models. 
PipelineModel.__bases__ = (keras.Model,) super().__init__(*args, **kwargs) - self.include_preprocessing = include_preprocessing - - def preprocess_features(self, x): - """An overridable function which preprocesses features.""" - return x - - def preprocess_labels(self, y): - """An overridable function which preprocesses labels.""" - return y def preprocess_samples(self, x, y=None, sample_weight=None): """An overridable function which preprocesses entire samples.""" - x = self.preprocess_features(x) - if y is not None: - y = self.preprocess_labels(y) return pack_x_y_sample_weight(x, y, sample_weight) # ======================================================================== @@ -184,10 +172,9 @@ def fit( ) x = _convert_inputs_to_dataset(x, y, sample_weight, batch_size) - if self.include_preprocessing: - x = x.map( - self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE - ).prefetch(tf.data.AUTOTUNE) + x = x.map( + self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE + ).prefetch(tf.data.AUTOTUNE) if validation_data is not None: if not isinstance(validation_data, tf.data.Dataset): @@ -221,10 +208,9 @@ def evaluate( # needs preprocessing. kwargs.pop("_use_cached_eval_dataset", None) x = _convert_inputs_to_dataset(x, y, sample_weight, batch_size) - if self.include_preprocessing: - x = x.map( - self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE - ).prefetch(tf.data.AUTOTUNE) + x = x.map( + self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE + ).prefetch(tf.data.AUTOTUNE) return super().evaluate( x=x, y=None, @@ -239,11 +225,9 @@ def predict( **kwargs, ): x = _convert_inputs_to_dataset(x, None, None, batch_size) - if self.include_preprocessing: - x = x.map( - self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE - ).prefetch(tf.data.AUTOTUNE) - + x = x.map( + self.preprocess_samples, num_parallel_calls=tf.data.AUTOTUNE + ).prefetch(tf.data.AUTOTUNE) return super().predict( x=x, batch_size=None, @@ -257,14 +241,13 @@ def train_on_batch( sample_weight=None, **kwargs, ): - if self.include_preprocessing: - data = self.preprocess_samples(x, y, sample_weight) - x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data) - x = ops.convert_to_tensor(x) - if y is not None: - y = ops.convert_to_tensor(y) - if sample_weight is not None: - sample_weight = ops.convert_to_tensor(sample_weight) + data = self.preprocess_samples(x, y, sample_weight) + x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data) + x = ops.convert_to_tensor(x) + if y is not None: + y = ops.convert_to_tensor(y) + if sample_weight is not None: + sample_weight = ops.convert_to_tensor(sample_weight) return super().train_on_batch( x=x, y=y, @@ -279,14 +262,13 @@ def test_on_batch( sample_weight=None, **kwargs, ): - if self.include_preprocessing: - data = self.preprocess_samples(x, y, sample_weight) - x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data) - x = ops.convert_to_tensor(x) - if y is not None: - y = ops.convert_to_tensor(y) - if sample_weight is not None: - sample_weight = ops.convert_to_tensor(sample_weight) + data = self.preprocess_samples(x, y, sample_weight) + x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data) + x = ops.convert_to_tensor(x) + if y is not None: + y = ops.convert_to_tensor(y) + if sample_weight is not None: + sample_weight = ops.convert_to_tensor(sample_weight) return super().test_on_batch( x=x, y=y, @@ -299,10 +281,9 @@ def predict_on_batch( x, **kwargs, ): - if self.include_preprocessing: - data = self.preprocess_samples(x) - x, _, 
_ = keras.utils.unpack_x_y_sample_weight(data) - x = ops.convert_to_tensor(x) + data = self.preprocess_samples(x) + x, _, _ = keras.utils.unpack_x_y_sample_weight(data) + x = ops.convert_to_tensor(x) return super().predict_on_batch( x=x, **kwargs, diff --git a/keras_nlp/utils/pipeline_model_test.py b/keras_nlp/utils/pipeline_model_test.py index 4c7c7f1964..ae71f9f570 100644 --- a/keras_nlp/utils/pipeline_model_test.py +++ b/keras_nlp/utils/pipeline_model_test.py @@ -36,8 +36,9 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.dense = keras.layers.Dense(1) - def preprocess_features(self, x): - return tf.strings.to_number(x) + def preprocess_samples(self, x, y=None, sample_weight=None): + x = tf.strings.to_number(x) + return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) def call(self, inputs): return self.dense(inputs) @@ -48,8 +49,10 @@ def __init__(self, **kwargs): super().__init__(**kwargs) self.dense = keras.layers.Dense(1) - def preprocess_labels(self, y): - return tf.strings.to_number(y) + def preprocess_samples(self, x, y=None, sample_weight=None): + if y is not None: + y = tf.strings.to_number(y) + return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) def call(self, inputs): return self.dense(inputs) @@ -63,8 +66,7 @@ def __init__(self, **kwargs): self.dense = keras.layers.Dense(1) def preprocess_samples(self, x, y=None, sample_weight=None): - x = tf.strings.to_number(x) - y = x + y = x = tf.strings.to_number(x) return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) def call(self, inputs): @@ -77,8 +79,9 @@ def __init__(self, **kwargs): outputs = keras.layers.Dense(1)(inputs) super().__init__(inputs, outputs, **kwargs) - def preprocess_features(self, x): - return tf.strings.to_number(x) + def preprocess_samples(self, x, y=None, sample_weight=None): + x = tf.strings.to_number(x) + return keras.utils.pack_x_y_sample_weight(x, y, sample_weight) def get_config(self): return {} @@ -167,19 +170,6 @@ def test_fit_with_preprocessing(self): model.fit(x=x, y=y, batch_size=8) model.fit(tf.data.Dataset.from_tensor_slices((x, y)).batch(8)) - def test_fit_no_preprocessing(self): - x = np.random.uniform(size=(100, 5)) - y = np.random.uniform(size=(100, 1)) - sw = np.random.uniform(size=(100, 1)) - model = FeaturePipeline(include_preprocessing=False) - model.compile(loss="mse") - # With sample weight. - model.fit(x=x, y=y, sample_weight=sw, batch_size=8) - model.fit(tf.data.Dataset.from_tensor_slices((x, y, sw)).batch(8)) - # Without sample weight. - model.fit(x=x, y=y, batch_size=8) - model.fit(tf.data.Dataset.from_tensor_slices((x, y)).batch(8)) - def test_evaluate_with_preprocessing(self): x = tf.strings.as_string(np.random.uniform(size=(100, 5))) y = np.random.uniform(size=(100, 1)) @@ -193,19 +183,6 @@ def test_evaluate_with_preprocessing(self): model.evaluate(x=x, y=y, batch_size=8) model.evaluate(tf.data.Dataset.from_tensor_slices((x, y)).batch(8)) - def test_evaluate_no_preprocessing(self): - x = np.random.uniform(size=(100, 5)) - y = np.random.uniform(size=(100, 1)) - sw = np.random.uniform(size=(100, 1)) - model = FeaturePipeline(include_preprocessing=False) - model.compile(loss="mse") - # With sample weight. - model.evaluate(x=x, y=y, sample_weight=sw, batch_size=8) - model.evaluate(tf.data.Dataset.from_tensor_slices((x, y, sw)).batch(8)) - # Without sample weight. 
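These test deletions track the removal of `include_preprocessing`: a pipeline that needs no preprocessing now simply leaves `preprocess_samples` at its default, which packs `x`, `y`, and `sample_weight` unchanged. A small sketch of that case (illustrative subclass, mirroring the test pipelines above):

import numpy as np
import keras
from keras_nlp.utils.pipeline_model import PipelineModel

class NoopPipeline(PipelineModel):
    # No preprocess_samples override: inputs pass through untouched.
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.dense = keras.layers.Dense(1)

    def call(self, inputs):
        return self.dense(inputs)

model = NoopPipeline()
model.compile(loss="mse")
x = np.random.uniform(size=(16, 5))
y = np.random.uniform(size=(16, 1))
model.fit(x=x, y=y, batch_size=8)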
-        model.evaluate(x=x, y=y, batch_size=8)
-        model.evaluate(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))
-
     def test_predict_with_preprocessing(self):
         x = tf.strings.as_string(np.random.uniform(size=(100, 5)))
         model = FeaturePipeline()
@@ -213,13 +190,6 @@ def test_predict_with_preprocessing(self):
         model.predict(x=x, batch_size=8)
         model.predict(tf.data.Dataset.from_tensor_slices(x).batch(8))

-    def test_predict_no_preprocessing(self):
-        x = np.random.uniform(size=(100, 5))
-        model = FeaturePipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        model.predict(x=x, batch_size=8)
-        model.predict(tf.data.Dataset.from_tensor_slices(x).batch(8))
-
     def test_on_batch(self):
         x = tf.strings.as_string(np.random.uniform(size=(8, 5)))
         y = np.random.uniform(size=(8, 1))
@@ -234,19 +204,6 @@ def test_on_batch(self):
         model.test_on_batch(x=x, y=y)
         model.predict_on_batch(x=x)

-    def test_on_batch_no_preprocessing(self):
-        x = np.random.uniform(size=(8, 5))
-        y = np.random.uniform(size=(8, 1))
-        sw = np.random.uniform(size=(8, 1))
-        model = FeaturePipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        # With sample weight.
-        model.train_on_batch(x=x, y=y, sample_weight=sw)
-        model.test_on_batch(x=x, y=y, sample_weight=sw)
-        # Without sample weight.
-        model.train_on_batch(x=x, y=y)
-        model.test_on_batch(x=x, y=y)
-
     def test_saved_model(self):
         model = FeaturePipeline()
         x = tf.strings.as_string(np.random.uniform(size=(8, 5)))
@@ -278,19 +235,6 @@ def test_fit_with_preprocessing(self):
         model.fit(x=x, y=y, batch_size=8)
         model.fit(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))

-    def test_fit_no_preprocessing(self):
-        x = np.random.uniform(size=(100, 5))
-        y = np.random.uniform(size=(100, 1))
-        sw = np.random.uniform(size=(100, 1))
-        model = LabelPipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        # With sample weight.
-        model.fit(x=x, y=y, sample_weight=sw, batch_size=8)
-        model.fit(tf.data.Dataset.from_tensor_slices((x, y, sw)).batch(8))
-        # Without sample weight.
-        model.fit(x=x, y=y, batch_size=8)
-        model.fit(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))
-
     def test_evaluate_with_preprocessing(self):
         x = np.random.uniform(size=(100, 5))
         y = tf.strings.as_string(np.random.uniform(size=(100, 1)))
@@ -304,19 +248,6 @@ def test_evaluate_with_preprocessing(self):
         model.evaluate(x=x, y=y, batch_size=8)
         model.evaluate(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))

-    def test_evaluate_no_preprocessing(self):
-        x = np.random.uniform(size=(100, 5))
-        y = np.random.uniform(size=(100, 1))
-        sw = np.random.uniform(size=(100, 1))
-        model = LabelPipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        # With sample weight.
-        model.evaluate(x=x, y=y, sample_weight=sw, batch_size=8)
-        model.evaluate(tf.data.Dataset.from_tensor_slices((x, y, sw)).batch(8))
-        # Without sample weight.
-        model.evaluate(x=x, y=y, batch_size=8)
-        model.evaluate(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))
-
     def test_predict_with_preprocessing(self):
         x = np.random.uniform(size=(100, 5))
         model = LabelPipeline()
@@ -338,20 +269,6 @@ def test_on_batch(self):
         model.test_on_batch(x=x, y=y)
         model.predict_on_batch(x=x)

-    def test_on_batch_no_preprocessing(self):
-        x = np.random.uniform(size=(8, 5))
-        y = np.random.uniform(size=(8, 1))
-        sw = np.random.uniform(size=(8, 1))
-        model = LabelPipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        # With sample weight.
-        model.train_on_batch(x=x, y=y, sample_weight=sw)
-        model.test_on_batch(x=x, y=y, sample_weight=sw)
-        # Without sample weight.
-        model.train_on_batch(x=x, y=y)
-        model.test_on_batch(x=x, y=y)
-        model.predict_on_batch(x=x)
-
     def test_saved_model(self):
         model = LabelPipeline()
         x = np.random.uniform(size=(8, 5))
@@ -377,14 +294,6 @@ def test_fit_with_preprocessing(self):
         model.fit(x=data, batch_size=8)
         model.fit(tf.data.Dataset.from_tensor_slices(data).batch(8))

-    def test_fit_no_preprocessing(self):
-        x = np.random.uniform(size=(100, 1))
-        y = np.random.uniform(size=(100, 1))
-        model = DataPipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        model.fit(x=x, y=y, batch_size=8)
-        model.fit(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))
-
     def test_evaluate_with_preprocessing(self):
         data = tf.strings.as_string(np.random.uniform(size=(100, 1)))
         model = DataPipeline()
@@ -392,14 +301,6 @@ def test_evaluate_with_preprocessing(self):
         model.evaluate(x=data, batch_size=8)
         model.evaluate(tf.data.Dataset.from_tensor_slices(data).batch(8))

-    def test_evaluate_no_preprocessing(self):
-        x = np.random.uniform(size=(100, 1))
-        y = np.random.uniform(size=(100, 1))
-        model = DataPipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        model.evaluate(x=x, y=y, batch_size=8)
-        model.evaluate(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))
-
     def test_predict_with_preprocessing(self):
         x = tf.strings.as_string(np.random.uniform(size=(100, 1)))
         model = DataPipeline()
@@ -407,13 +308,6 @@ def test_predict_with_preprocessing(self):
         model.predict(x=x, batch_size=8)
         model.predict(tf.data.Dataset.from_tensor_slices(x).batch(8))

-    def test_predict_no_preprocessing(self):
-        x = np.random.uniform(size=(100, 1))
-        model = DataPipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        model.predict(x=x, batch_size=8)
-        model.predict(tf.data.Dataset.from_tensor_slices(x).batch(8))
-
     def test_on_batch(self):
         data = tf.strings.as_string(np.random.uniform(size=(8, 1)))
         model = DataPipeline()
@@ -426,20 +320,6 @@ def test_on_batch(self):
         model.test_on_batch(x=data)
         model.predict_on_batch(x=data)

-    def test_on_batch_no_preprocessing(self):
-        x = np.random.uniform(size=(8, 1))
-        y = np.random.uniform(size=(8, 1))
-        sw = np.random.uniform(size=(8, 1))
-        model = DataPipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        # With sample weight.
-        model.train_on_batch(x=x, y=y, sample_weight=sw)
-        model.test_on_batch(x=x, y=y, sample_weight=sw)
-        # Without sample weight.
-        model.train_on_batch(x=x, y=y)
-        model.test_on_batch(x=x, y=y)
-        model.predict_on_batch(x=x)
-
     def test_saved_model(self):
         model = DataPipeline()
         data = tf.strings.as_string(np.random.uniform(size=(8, 1)))
@@ -472,19 +352,6 @@ def test_fit(self):
         model.fit(x=x, y=y, batch_size=8)
         model.fit(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))

-    def test_fit_no_preprocessing(self):
-        x = np.random.uniform(size=(100, 5))
-        y = np.random.uniform(size=(100, 1))
-        sw = np.random.uniform(size=(100, 1))
-        model = FunctionalPipeline(include_preprocessing=False)
-        model.compile(loss="mse")
-        # With sample weight.
-        model.fit(x=x, y=y, sample_weight=sw, batch_size=8)
-        model.fit(tf.data.Dataset.from_tensor_slices((x, y, sw)).batch(8))
-        # Without sample weight.
-        model.fit(x=x, y=y, batch_size=8)
-        model.fit(tf.data.Dataset.from_tensor_slices((x, y)).batch(8))
-
     def test_saved_model(self):
         model = FunctionalPipeline()
         x = tf.strings.as_string(np.random.uniform(size=(8, 5)))
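
Usage sketch (illustrative, not part of the patch): with include_preprocessing, preprocess_features(), and preprocess_labels() removed, a PipelineModel subclass routes all preprocessing through preprocess_samples(), which fit(), evaluate(), predict(), and the *_on_batch() methods now always apply. The class name below is hypothetical and simply mirrors the updated FeaturePipeline test fixture; a plain "import keras" is assumed.

    # Minimal sketch, assuming keras_nlp.utils.pipeline_model.PipelineModel as changed above.
    import keras
    import numpy as np
    import tensorflow as tf

    from keras_nlp.utils.pipeline_model import PipelineModel


    class StringFeaturePipeline(PipelineModel):  # hypothetical name
        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self.dense = keras.layers.Dense(1)

        def preprocess_samples(self, x, y=None, sample_weight=None):
            # Features, labels, and sample weights are preprocessed together;
            # there is no longer an include_preprocessing switch to bypass this.
            x = tf.strings.to_number(x)
            return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)

        def call(self, inputs):
            return self.dense(inputs)


    x = tf.strings.as_string(np.random.uniform(size=(100, 5)))
    y = np.random.uniform(size=(100, 1))
    model = StringFeaturePipeline()
    model.compile(loss="mse")
    # preprocess_samples() is mapped over the tf.data pipeline before training.
    model.fit(x=x, y=y, batch_size=8)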