From 301f96c351da23d0778ebc7a141e01603ff9b9fd Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Thu, 22 Sep 2022 10:03:08 +0530 Subject: [PATCH 01/11] chore: initial commit --- .../models/levit/modeling_tf_levit.py | 822 ++++++++++++++++++ 1 file changed, 822 insertions(+) create mode 100644 src/transformers/models/levit/modeling_tf_levit.py diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py new file mode 100644 index 000000000000..02cffe532d52 --- /dev/null +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -0,0 +1,822 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TensorFlow LeViT model.""" + +import itertools +from dataclasses import dataclass +from typing import Optional, Tuple + +import tensorflow as tf +from tensorflow.keras import backend as K + +from ...modeling_outputs import ModelOutput +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithNoAttention, + TFBaseModelOutputWithPoolingAndNoAttention, + TFImageClassifierOutputWithNoAttention, +) +from ...modeling_tf_utils import TFPreTrainedModel +from ...tf_utils import shape_list, stable_softmax +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_levit import LevitConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "LevitConfig" +_FEAT_EXTRACTOR_FOR_DOC = "LevitFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/levit-128S" +_EXPECTED_OUTPUT_SHAPE = [1, 16, 384] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "facebook/levit-128S" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + +LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/levit-128S", + # See all LeViT models at https://huggingface.co/models?filter=levit +] + + +@dataclass +class TFLevitForImageClassificationWithTeacherOutput(ModelOutput): + """ + Output type of [`LevitForImageClassificationWithTeacher`]. + + Args: + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores as the average of the `cls_logits` and `distillation_logits`. + cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the + class token). + distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the + distillation token). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the model at the output of each layer
+            plus the initial embedding outputs.
+    """
+
+    logits: tf.Tensor = None
+    cls_logits: tf.Tensor = None
+    distillation_logits: tf.Tensor = None
+    hidden_states: Optional[Tuple[tf.Tensor]] = None
+
+
+class TFLevitConvEmbeddings(tf.keras.layers.Layer):
+    """
+    LeViT Conv Embeddings with Batch Norm, used in the initial patch embedding layer.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation=1,
+        groups=1,
+        bn_weight_init=1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # torch-style integer padding has no equivalent in `tf.keras.layers.Conv2D` (which
+        # only accepts "valid"/"same"), so pad explicitly and convolve without padding
+        self.zero_padding = tf.keras.layers.ZeroPadding2D(padding=padding, data_format="channels_first")
+        self.convolution = tf.keras.layers.Conv2D(
+            filters=out_channels,
+            kernel_size=kernel_size,
+            strides=stride,
+            padding="valid",
+            dilation_rate=dilation,
+            groups=groups,
+            use_bias=False,
+            data_format="channels_first",  # required for tf
+            name="convolution",
+        )
+        # The epsilon and momentum used here are the defaults in torch batch norm layer.
+        self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm")
+
+    def call(self, embeddings):
+        embeddings = self.convolution(self.zero_padding(embeddings))
+        embeddings = self.batch_norm(embeddings)
+        return embeddings
+
+
+# Defining hard swish with keras backend.
+def hard_swish(x):
+    return x * (K.relu(x + 3.0, max_value=6.0) / 6.0)
+
+
+class TFLevitPatchEmbeddings(tf.keras.layers.Layer):
+    """
+    LeViT patch embeddings, for final embeddings to be passed to transformer blocks. It consists of multiple
+    `TFLevitConvEmbeddings`.
+    """
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.embedding_layer_1 = TFLevitConvEmbeddings(
+            config.num_channels,
+            config.hidden_sizes[0] // 8,
+            config.kernel_size,
+            config.stride,
+            config.padding,
+            name="embedding_layer_1",
+        )
+        self.activation_layer_1 = hard_swish
+
+        self.embedding_layer_2 = TFLevitConvEmbeddings(
+            config.hidden_sizes[0] // 8,
+            config.hidden_sizes[0] // 4,
+            config.kernel_size,
+            config.stride,
+            config.padding,
+            name="embedding_layer_2",
+        )
+        self.activation_layer_2 = hard_swish
+
+        self.embedding_layer_3 = TFLevitConvEmbeddings(
+            config.hidden_sizes[0] // 4,
+            config.hidden_sizes[0] // 2,
+            config.kernel_size,
+            config.stride,
+            config.padding,
+            name="embedding_layer_3",
+        )
+        self.activation_layer_3 = hard_swish
+
+        self.embedding_layer_4 = TFLevitConvEmbeddings(
+            config.hidden_sizes[0] // 2,
+            config.hidden_sizes[0],
+            config.kernel_size,
+            config.stride,
+            config.padding,
+            name="embedding_layer_4",
+        )
+        self.num_channels = config.num_channels
+
+    def call(self, pixel_values):
+        batch_size = tf.shape(pixel_values)[0]
+        num_channels = tf.shape(pixel_values)[1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
+            )
+        embeddings = self.embedding_layer_1(pixel_values)
+        embeddings = self.activation_layer_1(embeddings)
+        embeddings = self.embedding_layer_2(embeddings)
+        embeddings = self.activation_layer_2(embeddings)
+        embeddings = self.embedding_layer_3(embeddings)
+        embeddings = self.activation_layer_3(embeddings)
+        embeddings = self.embedding_layer_4(embeddings)
+        # Flatten the spatial dimensions of the embeddings; the channel dimension here is
+        # the output width of `embedding_layer_4`, not the `num_channels` of the input
+        out_channels = tf.shape(embeddings)[1]
+        flattened_embeddings = tf.reshape(embeddings, shape=(batch_size, out_channels, -1))
+        # Transpose the channel and spatial axis of the flattened embeddings
+        transposed_embeddings = tf.transpose(flattened_embeddings, perm=(0, 2, 1))
+        return transposed_embeddings
+
+
+class TFMLPLayerWithBN(tf.keras.layers.Layer):
+    def __init__(self, input_dim, output_dim, bn_weight_init=1, **kwargs):
+        super().__init__(**kwargs)
+        self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear")
+        # The epsilon and momentum used here are the defaults in torch batch norm layer.
+        self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm")
+
+    def call(self, hidden_state):
+        hidden_state = self.linear(hidden_state)
+        # The channel dimension has to be read after the linear projection, which changes
+        # it from `input_dim` to `output_dim`
+        num_channels = tf.shape(hidden_state)[2]
+        # Before sending the hidden state to the batch normalization layer, we would have to
+        # flatten the hidden states in the batch and seq len dimension
+        flattened_hidden_state = tf.reshape(hidden_state, shape=(-1, num_channels))
+        batch_norm_hidden_state = self.batch_norm(flattened_hidden_state)
+        # Reshape the output of batch norm to have the same shape as the original hidden state
+        hidden_state = tf.reshape(batch_norm_hidden_state, shape=tf.shape(hidden_state))
+        return hidden_state
+
+
+class TFLevitSubsample(tf.keras.layers.Layer):
+    def __init__(self, stride, resolution, **kwargs):
+        super().__init__(**kwargs)
+        self.stride = stride
+        self.resolution = resolution
+
+    def call(self, hidden_state):
+        batch_size = tf.shape(hidden_state)[0]
+        channels = tf.shape(hidden_state)[2]
+        reshaped_hidden_state = tf.reshape(
+            hidden_state, shape=(batch_size, self.resolution, self.resolution, channels)
+        )
+        strided_hidden_state = reshaped_hidden_state[:, :: self.stride, :: self.stride]
+        hidden_state = tf.reshape(strided_hidden_state, shape=(batch_size, -1, channels))
+        return hidden_state
+
+
+class TFLevitAttention(tf.keras.layers.Layer):
+    def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution, **kwargs):
+        super().__init__(**kwargs)
+        self.num_attention_heads = num_attention_heads
+        self.scale = key_dim**-0.5
+        self.key_dim = key_dim
+        self.attention_ratio = attention_ratio
+        self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads * 2
+        self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
+
+        self.queries_keys_values = TFMLPLayerWithBN(hidden_sizes, self.out_dim_keys_values, name="queries_keys_values")
+        self.activation = hard_swish
+        self.projection = TFMLPLayerWithBN(self.out_dim_projection, hidden_sizes, bn_weight_init=0, name="projection")
+
+        points = list(itertools.product(range(resolution), range(resolution)))
+        len_points = len(points)
+        attention_offsets, indices = {}, []
+        for p1 in points:
+            for p2 in points:
+                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                indices.append(attention_offsets[offset])
+        self.attention_offsets = attention_offsets
+        self.indices = indices
+        self.attention_bias_cache = {}
+
+    def 
build(self, input_shape): + self.attention_biases = self.add_weight( + shape=(self.num_attention_heads, len(self.attention_offsets)), + initializer="zeros", + trainable=True, + name="attention_biases", + ) + super().build(input_shape) + + # Todo: @ariG23498 + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + + def get_attention_biases(self, device, attention_bias_idxs, training=None): + if training: + return self.attention_biases[:, attention_bias_idxs] + else: + device_key = str(device) + if device_key not in self.attention_bias_cache: + self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + return self.attention_bias_cache[device_key] + + def call(self, hidden_state, attention_bias_idxs, training=None): + batch_size = tf.shape(hidden_state)[0] + seq_length = tf.shape(hidden_state)[1] + queries_keys_values = self.queries_keys_values(hidden_state) + + # Reshape queries_keys_values + reshaped_queries_keys_values = tf.reshape( + queries_keys_values, shape=(batch_size, seq_length, self.num_attention_heads, -1) + ) + query, key, value = tf.split( + value=reshaped_queries_keys_values, + num_or_size_splits=[self.key_dim, self.key_dim, self.attention_ratio * self.key_dim], + axis=3, + ) + query = tf.transpose(query, perm=(0, 2, 1, 3)) + key = tf.transpose(key, perm=(0, 2, 1, 3)) + value = tf.transpose(value, perm=(0, 2, 1, 3)) + + attention = tf.matmul(query, key, transpose_b=True) * self.scale + self.get_attention_biases( + hidden_state.device, attention_bias_idxs, training=training + ) + attention = stable_softmax(attention, axis=-1) + hidden_state = tf.matmul(attention, value) + hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3)) + hidden_state = tf.reshape(hidden_state, shape=(batch_size, seq_length, self.out_dim_projection)) + hidden_state = self.projection(self.activation(hidden_state)) + return hidden_state + + +class LevitAttentionSubsample(nn.Module): + def __init__( + self, + input_dim, + output_dim, + key_dim, + num_attention_heads, + attention_ratio, + stride, + resolution_in, + resolution_out, + ): + super().__init__() + self.num_attention_heads = num_attention_heads + self.scale = key_dim**-0.5 + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads + self.out_dim_projection = attention_ratio * key_dim * num_attention_heads + self.resolution_out = resolution_out + # resolution_in is the intial resolution, resoloution_out is final resolution after downsampling + self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values) + self.queries_subsample = LevitSubsample(stride, resolution_in) + self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads) + self.activation = nn.Hardswish() + self.projection = MLPLayerWithBN(self.out_dim_projection, output_dim) + + self.attention_bias_cache = {} + + points = list(itertools.product(range(resolution_in), range(resolution_in))) + points_ = list(itertools.product(range(resolution_out), range(resolution_out))) + len_points, len_points_ = len(points), len(points_) + attention_offsets, indices = {}, [] + for p1 in points_: + for p2 in points: + size = 1 + offset = (abs(p1[0] * stride - p2[0] + (size - 1) / 2), abs(p1[1] * stride - p2[1] + (size - 1) / 2)) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + 
indices.append(attention_offsets[offset]) + + self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets))) + self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points)) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + + def get_attention_biases(self, device): + if self.training: + return self.attention_biases[:, self.attention_bias_idxs] + else: + device_key = str(device) + if device_key not in self.attention_bias_cache: + self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + return self.attention_bias_cache[device_key] + + def forward(self, hidden_state): + batch_size, seq_length, _ = hidden_state.shape + key, value = ( + self.keys_values(hidden_state) + .view(batch_size, seq_length, self.num_attention_heads, -1) + .split([self.key_dim, self.attention_ratio * self.key_dim], dim=3) + ) + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + + query = self.queries(self.queries_subsample(hidden_state)) + query = query.view(batch_size, self.resolution_out**2, self.num_attention_heads, self.key_dim).permute( + 0, 2, 1, 3 + ) + + attention = query @ key.transpose(-2, -1) * self.scale + self.get_attention_biases(hidden_state.device) + attention = attention.softmax(dim=-1) + hidden_state = (attention @ value).transpose(1, 2).reshape(batch_size, -1, self.out_dim_projection) + hidden_state = self.projection(self.activation(hidden_state)) + return hidden_state + + +class LevitMLPLayer(nn.Module): + """ + MLP Layer with `2X` expansion in contrast to ViT with `4X`. + """ + + def __init__(self, input_dim, hidden_dim): + super().__init__() + self.linear_up = MLPLayerWithBN(input_dim, hidden_dim) + self.activation = nn.Hardswish() + self.linear_down = MLPLayerWithBN(hidden_dim, input_dim) + + def forward(self, hidden_state): + hidden_state = self.linear_up(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.linear_down(hidden_state) + return hidden_state + + +class LevitResidualLayer(nn.Module): + """ + Residual Block for LeViT + """ + + def __init__(self, module, drop_rate): + super().__init__() + self.module = module + self.drop_rate = drop_rate + + def forward(self, hidden_state): + if self.training and self.drop_rate > 0: + rnd = torch.rand(hidden_state.size(0), 1, 1, device=hidden_state.device) + rnd = rnd.ge_(self.drop_rate).div(1 - self.drop_rate).detach() + hidden_state = hidden_state + self.module(hidden_state) * rnd + return hidden_state + else: + hidden_state = hidden_state + self.module(hidden_state) + return hidden_state + + +class LevitStage(nn.Module): + """ + LeViT Stage consisting of `LevitMLPLayer` and `LevitAttention` layers. 
+ """ + + def __init__( + self, + config, + idx, + hidden_sizes, + key_dim, + depths, + num_attention_heads, + attention_ratio, + mlp_ratio, + down_ops, + resolution_in, + ): + super().__init__() + self.layers = [] + self.config = config + self.resolution_in = resolution_in + # resolution_in is the intial resolution, resolution_out is final resolution after downsampling + for _ in range(depths): + self.layers.append( + LevitResidualLayer( + LevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), + self.config.drop_path_rate, + ) + ) + if mlp_ratio > 0: + hidden_dim = hidden_sizes * mlp_ratio + self.layers.append( + LevitResidualLayer(LevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate) + ) + + if down_ops[0] == "Subsample": + self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 + self.layers.append( + LevitAttentionSubsample( + *self.config.hidden_sizes[idx : idx + 2], + key_dim=down_ops[1], + num_attention_heads=down_ops[2], + attention_ratio=down_ops[3], + stride=down_ops[5], + resolution_in=resolution_in, + resolution_out=self.resolution_out, + ) + ) + self.resolution_in = self.resolution_out + if down_ops[4] > 0: + hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4] + self.layers.append( + LevitResidualLayer( + LevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate + ) + ) + + self.layers = nn.ModuleList(self.layers) + + def get_resolution(self): + return self.resolution_in + + def forward(self, hidden_state): + for layer in self.layers: + hidden_state = layer(hidden_state) + return hidden_state + + +class LevitEncoder(nn.Module): + """ + LeViT Encoder consisting of multiple `LevitStage` stages. + """ + + def __init__(self, config): + super().__init__() + self.config = config + resolution = self.config.image_size // self.config.patch_size + self.stages = [] + self.config.down_ops.append([""]) + + for stage_idx in range(len(config.depths)): + stage = LevitStage( + config, + stage_idx, + config.hidden_sizes[stage_idx], + config.key_dim[stage_idx], + config.depths[stage_idx], + config.num_attention_heads[stage_idx], + config.attention_ratio[stage_idx], + config.mlp_ratio[stage_idx], + config.down_ops[stage_idx], + resolution, + ) + resolution = stage.get_resolution() + self.stages.append(stage) + + self.stages = nn.ModuleList(self.stages) + + def forward(self, hidden_state, output_hidden_states=False, return_dict=True): + all_hidden_states = () if output_hidden_states else None + + for stage in self.stages: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + hidden_state = stage(hidden_state) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + if not return_dict: + return tuple(v for v in [hidden_state, all_hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states) + + +class LevitClassificationLayer(nn.Module): + """ + LeViT Classification Layer + """ + + def __init__(self, input_dim, output_dim): + super().__init__() + self.batch_norm = nn.BatchNorm1d(input_dim) + self.linear = nn.Linear(input_dim, output_dim) + + def forward(self, hidden_state): + hidden_state = self.batch_norm(hidden_state) + logits = self.linear(hidden_state) + return logits + + +class LevitPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = LevitConfig + base_model_prefix = "levit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LevitModel): + module.gradient_checkpointing = value + + +LEVIT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`LevitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +LEVIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare Levit model outputting raw features without any specific head on top.", + LEVIT_START_DOCSTRING, +) +class LevitModel(LevitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.patch_embeddings = LevitPatchEmbeddings(config) + self.encoder = LevitEncoder(config) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndNoAttention, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: torch.FloatTensor = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embeddings = self.patch_embeddings(pixel_values) + encoder_outputs = self.encoder( + embeddings, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + + # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) + pooled_output = last_hidden_state.mean(dim=1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + LEVIT_START_DOCSTRING, +) +class LevitForImageClassification(LevitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.num_labels = config.num_labels + self.levit = LevitModel(config) + + # Classifier head + self.classifier = ( + LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels) + if config.num_labels > 0 + else torch.nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: torch.FloatTensor = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + sequence_output = outputs[0] + sequence_output = sequence_output.mean(1) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutputWithNoAttention( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + LeViT Model transformer with image classification heads on top (a linear layer on top of the final hidden state and + a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet. .. warning:: + This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet + supported. 
+ """, + LEVIT_START_DOCSTRING, +) +class LevitForImageClassificationWithTeacher(LevitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.num_labels = config.num_labels + self.levit = LevitModel(config) + + # Classifier head + self.classifier = ( + LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels) + if config.num_labels > 0 + else torch.nn.Identity() + ) + self.classifier_distill = ( + LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels) + if config.num_labels > 0 + else torch.nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=LevitForImageClassificationWithTeacherOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: torch.FloatTensor = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + sequence_output = outputs[0] + sequence_output = sequence_output.mean(1) + cls_logits, distill_logits = self.classifier(sequence_output), self.classifier_distill(sequence_output) + logits = (cls_logits + distill_logits) / 2 + + if not return_dict: + output = (logits, cls_logits, distill_logits) + outputs[2:] + return output + + return LevitForImageClassificationWithTeacherOutput( + logits=logits, + cls_logits=cls_logits, + distillation_logits=distill_logits, + hidden_states=outputs.hidden_states, + ) From 961a638029cc48626a2a3e4e9640f7e47d75acb2 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Thu, 6 Oct 2022 22:23:37 +0530 Subject: [PATCH 02/11] chore: porting layers into TF --- .../models/levit/modeling_tf_levit.py | 446 +++++++++++------- 1 file changed, 276 insertions(+), 170 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 02cffe532d52..6b1de7a42815 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -16,9 +16,10 @@ import itertools from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy, CategoricalCrossentropy from tensorflow.keras import backend as K from ...modeling_outputs import ModelOutput @@ -27,7 +28,7 @@ TFBaseModelOutputWithPoolingAndNoAttention, TFImageClassifierOutputWithNoAttention, ) -from ...modeling_tf_utils import TFPreTrainedModel +from ...modeling_tf_utils import TFPreTrainedModel, keras_serializable, unpack_inputs from ...tf_utils import shape_list, stable_softmax from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_levit import LevitConfig @@ -68,7 +69,7 @@ class token). Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the distillation token). 
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. """ @@ -263,7 +264,7 @@ def build(self, input_shape): ) super().build(input_shape) - # Todo: @ariG23498 + # TODO @ariG23498 @torch.no_grad() def train(self, mode=True): super().train(mode) @@ -308,7 +309,7 @@ def call(self, hidden_state, attention_bias_idxs, training=None): return hidden_state -class LevitAttentionSubsample(nn.Module): +class TFLevitAttentionSubsample(tf.keras.layers.Layer): def __init__( self, input_dim, @@ -319,8 +320,9 @@ def __init__( stride, resolution_in, resolution_out, + **kwargs, ): - super().__init__() + super().__init__(**kwargs) self.num_attention_heads = num_attention_heads self.scale = key_dim**-0.5 self.key_dim = key_dim @@ -329,11 +331,11 @@ def __init__( self.out_dim_projection = attention_ratio * key_dim * num_attention_heads self.resolution_out = resolution_out # resolution_in is the intial resolution, resoloution_out is final resolution after downsampling - self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values) - self.queries_subsample = LevitSubsample(stride, resolution_in) - self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads) - self.activation = nn.Hardswish() - self.projection = MLPLayerWithBN(self.out_dim_projection, output_dim) + self.keys_values = TFMLPLayerWithBN(input_dim, self.out_dim_keys_values, name="keys_values") + self.queries_subsample = TFLevitSubsample(stride, resolution_in, name="queries_subsample") + self.queries = TFMLPLayerWithBN(input_dim, key_dim * num_attention_heads, name="queries") + self.activation = hard_swish + self.projection = TFMLPLayerWithBN(self.out_dim_projection, output_dim, name="projection") self.attention_bias_cache = {} @@ -349,78 +351,97 @@ def __init__( attention_offsets[offset] = len(attention_offsets) indices.append(attention_offsets[offset]) - self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets))) - self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points)) + self.attention_offsets = attention_offsets + + def build(self, input_shape): + self.attention_biases = self.add_weight( + shape=(self.num_attention_heads, len(self.attention_offsets)), + initializer="zeros", + trainable=True, + name="attention_biases", + ) + super().build(input_shape) + # TODO @ariG23498 @torch.no_grad() def train(self, mode=True): super().train(mode) if mode and self.attention_bias_cache: self.attention_bias_cache = {} # clear ab cache - def get_attention_biases(self, device): - if self.training: - return self.attention_biases[:, self.attention_bias_idxs] + def get_attention_biases(self, device, attention_bias_idxs, training=None): + if training: + return self.attention_biases[:, attention_bias_idxs] else: device_key = str(device) if device_key not in self.attention_bias_cache: self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] return self.attention_bias_cache[device_key] - def forward(self, hidden_state): - batch_size, seq_length, _ = hidden_state.shape - key, value = ( - 
self.keys_values(hidden_state)
-            .view(batch_size, seq_length, self.num_attention_heads, -1)
-            .split([self.key_dim, self.attention_ratio * self.key_dim], dim=3)
+    def call(self, hidden_state, attention_bias_idxs, training=None):
+        batch_size = tf.shape(hidden_state)[0]
+        seq_length = tf.shape(hidden_state)[1]
+
+        # Project the hidden states and reshape them for the attention heads
+        reshaped_hidden_state = tf.reshape(
+            self.keys_values(hidden_state),
+            shape=(batch_size, seq_length, self.num_attention_heads, -1),
+        )
+        # Split the reshaped hidden state into key and value
+        key, value = tf.split(
+            reshaped_hidden_state,
+            num_or_size_splits=[self.key_dim, self.attention_ratio * self.key_dim],
+            axis=3,
         )
-        key = key.permute(0, 2, 1, 3)
-        value = value.permute(0, 2, 1, 3)
+        key = tf.transpose(key, perm=(0, 2, 1, 3))
+        value = tf.transpose(value, perm=(0, 2, 1, 3))
 
         query = self.queries(self.queries_subsample(hidden_state))
-        query = query.view(batch_size, self.resolution_out**2, self.num_attention_heads, self.key_dim).permute(
-            0, 2, 1, 3
-        )
+        query = tf.reshape(query, shape=(batch_size, self.resolution_out**2, self.num_attention_heads, self.key_dim))
+        query = tf.transpose(query, perm=(0, 2, 1, 3))
 
-        attention = query @ key.transpose(-2, -1) * self.scale + self.get_attention_biases(hidden_state.device)
+        attention = tf.matmul(query, key, transpose_b=True) * self.scale + self.get_attention_biases(
+            hidden_state.device, attention_bias_idxs, training=training
+        )
-        attention = attention.softmax(dim=-1)
-        hidden_state = (attention @ value).transpose(1, 2).reshape(batch_size, -1, self.out_dim_projection)
+        attention = stable_softmax(attention, axis=-1)
+        hidden_state = tf.matmul(attention, value)
+        hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3))
+        hidden_state = tf.reshape(hidden_state, shape=(batch_size, -1, self.out_dim_projection))
         hidden_state = self.projection(self.activation(hidden_state))
         return hidden_state
 
 
-class LevitMLPLayer(nn.Module):
+class TFLevitMLPLayer(tf.keras.layers.Layer):
     """
     MLP Layer with `2X` expansion in contrast to ViT with `4X`.
     """
 
-    def __init__(self, input_dim, hidden_dim):
-        super().__init__()
-        self.linear_up = MLPLayerWithBN(input_dim, hidden_dim)
-        self.activation = nn.Hardswish()
-        self.linear_down = MLPLayerWithBN(hidden_dim, input_dim)
+    def __init__(self, input_dim, hidden_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.linear_up = TFMLPLayerWithBN(input_dim, hidden_dim)
+        self.activation = hard_swish
+        self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim)
 
-    def forward(self, hidden_state):
+    def call(self, hidden_state):
         hidden_state = self.linear_up(hidden_state)
         hidden_state = self.activation(hidden_state)
         hidden_state = self.linear_down(hidden_state)
         return hidden_state
 
 
-class LevitResidualLayer(nn.Module):
+class TFLevitResidualLayer(tf.keras.layers.Layer):
     """
-    Residual Block for LeViT
+    Residual Block for TFLeViT
     """
 
-    def __init__(self, module, drop_rate):
-        super().__init__()
+    def __init__(self, module, drop_rate, **kwargs):
+        super().__init__(**kwargs)
         self.module = module
         self.drop_rate = drop_rate
 
-    def forward(self, hidden_state):
-        if self.training and self.drop_rate > 0:
-            rnd = torch.rand(hidden_state.size(0), 1, 1, device=hidden_state.device)
-            rnd = rnd.ge_(self.drop_rate).div(1 - self.drop_rate).detach()
+    def call(self, hidden_state, training=None):
+        if training and self.drop_rate > 0:
+            # torch.rand draws from a uniform distribution, so the TF port uses
+            # tf.random.uniform; the boolean keep-mask is cast before the rescaling
+            rnd = tf.random.uniform(shape=(tf.shape(hidden_state)[0], 1, 1), minval=0, maxval=1)
+            rnd = tf.cast(tf.math.greater_equal(rnd, self.drop_rate), hidden_state.dtype)
+            rnd = tf.math.divide(rnd, (1 - self.drop_rate))
             hidden_state = hidden_state + self.module(hidden_state) * rnd
             return hidden_state
         else:
@@ -428,9 +449,9 @@ def forward(self, hidden_state):
             hidden_state = hidden_state + self.module(hidden_state)
             return hidden_state
 
 
-class LevitStage(nn.Module):
+class TFLevitStage(tf.keras.layers.Layer):
     """
-    LeViT Stage consisting of `LevitMLPLayer` and `LevitAttention` layers.
+    LeViT Stage consisting of `TFLevitMLPLayer` and `TFLevitAttention` layers.
     
""" def __init__( @@ -445,29 +466,32 @@ def __init__( mlp_ratio, down_ops, resolution_in, + **kwargs, ): - super().__init__() + super().__init__(**kwargs) self.layers = [] self.config = config self.resolution_in = resolution_in # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - for _ in range(depths): + + # TODO ariG23498: add the index values to the layer names + for idx in range(depths): self.layers.append( - LevitResidualLayer( - LevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), + TFLevitResidualLayer( + TFLevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), self.config.drop_path_rate, ) ) if mlp_ratio > 0: hidden_dim = hidden_sizes * mlp_ratio self.layers.append( - LevitResidualLayer(LevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate) + TFLevitResidualLayer(TFLevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate) ) if down_ops[0] == "Subsample": self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 self.layers.append( - LevitAttentionSubsample( + TFLevitAttentionSubsample( *self.config.hidden_sizes[idx : idx + 2], key_dim=down_ops[1], num_attention_heads=down_ops[2], @@ -481,36 +505,35 @@ def __init__( if down_ops[4] > 0: hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4] self.layers.append( - LevitResidualLayer( - LevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate + TFLevitResidualLayer( + TFLevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate ) ) - self.layers = nn.ModuleList(self.layers) - def get_resolution(self): return self.resolution_in - def forward(self, hidden_state): + def call(self, hidden_state): for layer in self.layers: hidden_state = layer(hidden_state) return hidden_state -class LevitEncoder(nn.Module): +class TFLevitEncoder(tf.keras.layers.Layer): """ - LeViT Encoder consisting of multiple `LevitStage` stages. + LeViT Encoder consisting of multiple `TFLevitStage` stages. 
""" - def __init__(self, config): - super().__init__() + def __init__(self, config, **kwargs): + super().__init__(**kwargs) self.config = config resolution = self.config.image_size // self.config.patch_size self.stages = [] self.config.down_ops.append([""]) + # TODO ariG23498: add the index values to the layer names for stage_idx in range(len(config.depths)): - stage = LevitStage( + stage = TFLevitStage( config, stage_idx, config.hidden_sizes[stage_idx], @@ -525,9 +548,7 @@ def __init__(self, config): resolution = stage.get_resolution() self.stages.append(stage) - self.stages = nn.ModuleList(self.stages) - - def forward(self, hidden_state, output_hidden_states=False, return_dict=True): + def call(self, hidden_state, output_hidden_states=False, return_dict=True, training=None): all_hidden_states = () if output_hidden_states else None for stage in self.stages: @@ -540,10 +561,10 @@ def forward(self, hidden_state, output_hidden_states=False, return_dict=True): if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states) + return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states) -class LevitClassificationLayer(nn.Module): +class TFLevitClassificationLayer(tf.keras.layers.Layer): """ LeViT Classification Layer """ @@ -559,7 +580,46 @@ def forward(self, hidden_state): return logits -class LevitPreTrainedModel(PreTrainedModel): +@keras_serializable +class TFLeViTMainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + self.config = config + self.patch_embeddings = TFLevitPatchEmbeddings(config, name="patch_embeddings") + self.encoder = TFLevitEncoder(config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: tf.Tensor = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embeddings = self.patch_embeddings(pixel_values) + encoder_outputs = self.encoder( + embeddings, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + + # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) + pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + +class TFLevitPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
@@ -568,108 +628,161 @@ class LevitPreTrainedModel(PreTrainedModel): config_class = LevitConfig base_model_prefix = "levit" main_input_name = "pixel_values" - supports_gradient_checkpointing = True - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)): - module.bias.data.zero_() - module.weight.data.fill_(1.0) + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform( + shape=(3, self.config.num_channels, self.config.image_size, self.config.image_size), dtype=tf.float32 + ) + return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} + + @tf.function( + input_signature=[ + { + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + } + ] + ) + def serving(self, inputs): + """ + Method used for serving the model. - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LevitModel): - module.gradient_checkpointing = value + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + output = self.call(inputs) + + return self.serving_output(output) LEVIT_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. - Parameters: - config ([`LevitConfig`]): Model configuration class with all the parameters of the model. + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! 
If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`ViTConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. """ LEVIT_INPUTS_DOCSTRING = r""" Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See - [`AutoFeatureExtractor.__call__`] for details. + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See + [`ViTFeatureExtractor.__call__`] for details. + head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). 
""" + @add_start_docstrings( "The bare Levit model outputting raw features without any specific head on top.", LEVIT_START_DOCSTRING, ) -class LevitModel(LevitPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.config = config - self.patch_embeddings = LevitPatchEmbeddings(config) - self.encoder = LevitEncoder(config) - # Initialize weights and apply final processing - self.post_init() +class TFLevitModel(TFLevitPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.levit = TFLevitMainLayer(config=config, name="levit") + @unpack_inputs @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndNoAttention, + output_type=TFBaseModelOutputWithPoolingAndNoAttention, config_class=_CONFIG_FOR_DOC, modality="vision", expected_output=_EXPECTED_OUTPUT_SHAPE, ) - def forward( + def call( self, - pixel_values: torch.FloatTensor = None, + pixel_values: tf.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - embeddings = self.patch_embeddings(pixel_values) - encoder_outputs = self.encoder( - embeddings, + outputs = self.levit( + pixel_values=pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - - # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) - pooled_output = last_hidden_state.mean(dim=1) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPoolingAndNoAttention( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, + return outputs + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=hs, + attentions=attns, ) + @add_start_docstrings( """ Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. 
for
@@ -677,50 +790,46 @@ def forward(
     """,
     LEVIT_START_DOCSTRING,
 )
-class LevitForImageClassification(LevitPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
+class TFLevitForImageClassification(TFLevitPreTrainedModel):
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
         self.config = config
         self.num_labels = config.num_labels
-        self.levit = LevitModel(config)
+        self.levit = TFLeViTMainLayer(config, name="levit")
 
         # Classifier head
         self.classifier = (
-            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier")
             if config.num_labels > 0
-            else torch.nn.Identity()
+            else tf.identity
         )
 
-        # Initialize weights and apply final processing
-        self.post_init()
-
+    @unpack_inputs
     @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
-        output_type=ImageClassifierOutputWithNoAttention,
+        output_type=TFImageClassifierOutputWithNoAttention,
         config_class=_CONFIG_FOR_DOC,
         expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
     )
-    def forward(
+    def call(
         self,
-        pixel_values: torch.FloatTensor = None,
-        labels: Optional[torch.LongTensor] = None,
+        pixel_values: tf.Tensor = None,
+        labels: Optional[tf.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
         outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
 
         sequence_output = outputs[0]
-        sequence_output = sequence_output.mean(1)
+        sequence_output = tf.math.reduce_mean(sequence_output, axis=1)
         logits = self.classifier(sequence_output)
 
         loss = None
@@ -728,28 +837,31 @@ def forward(
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                elif self.num_labels > 1 and (labels.dtype == tf.int32 or labels.dtype == tf.int64):
                     self.config.problem_type = "single_label_classification"
                 else:
                     self.config.problem_type = "multi_label_classification"
-
+            # TODO @ariG23498: Check the implementation of the loss functions for the
+            # various problem types
             if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
+                loss_fct = MeanSquaredError()
                 if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                    # Keras losses take `(y_true, y_pred)`, the reverse of the torch argument order
+                    loss = loss_fct(tf.squeeze(labels), tf.squeeze(logits))
                 else:
-                    loss = loss_fct(logits, labels)
+                    loss = loss_fct(labels, logits)
             elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+                # Integer targets on top of logits map to the sparse categorical crossentropy in Keras
+                loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+                loss = loss_fct(tf.reshape(labels, shape=(-1,)), tf.reshape(logits, shape=(-1, self.num_labels)))
             elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits, labels)
+                loss_fct = BinaryCrossentropy(from_logits=True)
+                loss = loss_fct(labels, logits)
         if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
 
-        return ImageClassifierOutputWithNoAttention(
+        return TFImageClassifierOutputWithNoAttention(
             loss=loss,
             logits=logits,
             hidden_states=outputs.hidden_states,
         )
@@ -765,48 +875,44 @@ def forward(
     """,
     LEVIT_START_DOCSTRING,
 )
-class LevitForImageClassificationWithTeacher(LevitPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
+class TFLevitForImageClassificationWithTeacher(TFLevitPreTrainedModel):
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
         self.config = config
         self.num_labels = config.num_labels
-        self.levit = LevitModel(config)
+        self.levit = TFLeViTMainLayer(config, name="levit")
 
         # Classifier head
         self.classifier = (
-            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier")
             if config.num_labels > 0
-            else torch.nn.Identity()
+            else tf.identity
         )
         self.classifier_distill = (
-            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier_distill")
             if config.num_labels > 0
-            else torch.nn.Identity()
+            else tf.identity
         )
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
+
+    @unpack_inputs
     @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
-        output_type=LevitForImageClassificationWithTeacherOutput,
+        output_type=TFLevitForImageClassificationWithTeacherOutput,
         config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
-    def forward(
+    def call(
        self,
-        pixel_values: torch.FloatTensor = None,
+        pixel_values: tf.Tensor = None,
        output_hidden_states: 
Optional[bool] = None, return_dict: Optional[bool] = None, ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) sequence_output = outputs[0] - sequence_output = sequence_output.mean(1) + sequence_output = tf.math.reduce_mean(sequence_output, axis=1) cls_logits, distill_logits = self.classifier(sequence_output), self.classifier_distill(sequence_output) logits = (cls_logits + distill_logits) / 2 @@ -814,7 +920,7 @@ def forward( output = (logits, cls_logits, distill_logits) + outputs[2:] return output - return LevitForImageClassificationWithTeacherOutput( + return TFLevitForImageClassificationWithTeacherOutput( logits=logits, cls_logits=cls_logits, distillation_logits=distill_logits, From 94b35d24d001f400fb9128d62712daadab74e7e2 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sat, 8 Oct 2022 09:30:52 +0530 Subject: [PATCH 03/11] chore: adding training and other nits to TF --- .../models/levit/modeling_tf_levit.py | 92 +++++++++++-------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 6b1de7a42815..679f804e5d42 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -25,6 +25,7 @@ from ...modeling_outputs import ModelOutput from ...modeling_tf_outputs import ( TFBaseModelOutputWithNoAttention, + TFBaseModelOutputWithPooling, TFBaseModelOutputWithPoolingAndNoAttention, TFImageClassifierOutputWithNoAttention, ) @@ -416,9 +417,9 @@ class TFLevitMLPLayer(tf.keras.layers.Layer): def __init__(self, input_dim, hidden_dim, **kwargs): super().__init__(**kwargs) - self.linear_up = TFMLPLayerWithBN(input_dim, hidden_dim) + self.linear_up = TFMLPLayerWithBN(input_dim, hidden_dim, name="linear_up") self.activation = hard_swish - self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim) + self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim, name="linear_down") def call(self, hidden_state): hidden_state = self.linear_up(hidden_state) @@ -474,18 +475,22 @@ def __init__( self.resolution_in = resolution_in # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - # TODO ariG23498: add the index values to the layer names for idx in range(depths): self.layers.append( TFLevitResidualLayer( TFLevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), self.config.drop_path_rate, + name=f"layers.{idx}", ) ) if mlp_ratio > 0: hidden_dim = hidden_sizes * mlp_ratio self.layers.append( - TFLevitResidualLayer(TFLevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate) + TFLevitResidualLayer( + TFLevitMLPLayer(hidden_sizes, hidden_dim), + self.config.drop_path_rate, + name=f"layers.{idx}", + ) ) if down_ops[0] == "Subsample": @@ -499,6 +504,7 @@ def __init__( stride=down_ops[5], resolution_in=resolution_in, resolution_out=self.resolution_out, + name=f"layers.{idx}", ) ) self.resolution_in = self.resolution_out @@ -506,7 +512,9 @@ def __init__( hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4] self.layers.append( TFLevitResidualLayer( - TFLevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate + TFLevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), + self.config.drop_path_rate, + name=f"layers.{idx}", ) ) @@ -544,6 +552,7 @@ def __init__(self, config, 
**kwargs): config.mlp_ratio[stage_idx], config.down_ops[stage_idx], resolution, + name=f"stages.{stage_idx}" ) resolution = stage.get_resolution() self.stages.append(stage) @@ -571,17 +580,19 @@ class TFLevitClassificationLayer(tf.keras.layers.Layer): def __init__(self, input_dim, output_dim): super().__init__() - self.batch_norm = nn.BatchNorm1d(input_dim) - self.linear = nn.Linear(input_dim, output_dim) - def forward(self, hidden_state): + # The epsilon and momentum used here are the defaults in torch batch norm layer. + self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") + self.linear = tf.keras.layers.Dense(units=output_dim, name="linear") + + def call(self, hidden_state, training=None): hidden_state = self.batch_norm(hidden_state) logits = self.linear(hidden_state) return logits @keras_serializable -class TFLeViTMainLayer(tf.keras.layers.Layer): +class TFLevitMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.config = config @@ -594,15 +605,17 @@ def call( pixel_values: tf.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + training: Optional[bool] = None, ): if pixel_values is None: raise ValueError("You have to specify pixel_values") - embeddings = self.patch_embeddings(pixel_values) + embeddings = self.patch_embeddings(pixel_values, training=training) encoder_outputs = self.encoder( embeddings, output_hidden_states=output_hidden_states, return_dict=return_dict, + training=training, ) last_hidden_state = encoder_outputs[0] @@ -699,36 +712,21 @@ def serving(self, inputs): Args: - config ([`ViTConfig`]): Model configuration class with all the parameters of the model. + config ([`LevitConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ LEVIT_INPUTS_DOCSTRING = r""" Args: - pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See - [`ViTFeatureExtractor.__call__`] for details. - - head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the - config will be used instead. + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. 
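# The classification head above maps torch's BatchNorm1d + Linear pair onto
# Keras layers one-to-one; a standalone sketch of that pattern (names are
# illustrative, not the module's own):
import tensorflow as tf

class SketchClassificationLayer(tf.keras.layers.Layer):
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1)
        self.linear = tf.keras.layers.Dense(units=output_dim)

    def call(self, hidden_state, training=None):
        hidden_state = self.batch_norm(hidden_state, training=training)
        return self.linear(hidden_state)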
This argument can be used only in eager mode, in graph mode the value in the config will be - used instead. - interpolate_pos_encoding (`bool`, *optional*): - Whether to interpolate the pre-trained position encodings. + more detail. return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in - eager mode, in graph mode the value will always be set to True. + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. training (`bool`, *optional*, defaults to `False``): Whether or not to use the model in training mode (some modules like dropout modules have different behaviors between training and evaluation). @@ -761,16 +759,19 @@ def call( pixel_values: tf.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + training: Optional[bool] = None, ): outputs = self.levit( pixel_values=pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict, + training=training, ) return outputs - def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + # TODO @ariG23498: Check the output type for serving. + def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -795,11 +796,11 @@ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.config = config self.num_labels = config.num_labels - self.levit = TFLeViTMainLayer(config, name="levit") + self.levit = TFLevitMainLayer(config, name="levit") # Classifier head self.classifier = ( - TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels) + TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier") if config.num_labels > 0 else tf.identity ) @@ -819,6 +820,7 @@ def call( labels: Optional[tf.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + training: Optional[bool] = None, ): r""" labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): @@ -826,7 +828,12 @@ def call( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
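# The `training` flag threaded through these `call` signatures is what switches
# BatchNormalization between batch statistics (training) and stored moving
# averages (inference); a tiny demonstration, independent of the model:
import tensorflow as tf

batch_norm = tf.keras.layers.BatchNormalization()
inputs = tf.random.normal((4, 8))
train_out = batch_norm(inputs, training=True)   # normalizes with batch statistics
infer_out = batch_norm(inputs, training=False)  # normalizes with moving averages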
""" - outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.levit( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) sequence_output = outputs[0] sequence_output = tf.math.reduce_mean(sequence_output, axis=1) @@ -841,8 +848,7 @@ def call( self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" - # TODO @ariG23498: Check the implementation of the loss fucntions for the - # various problem types + if self.config.problem_type == "regression": loss_fct = MeanSquaredError() if self.num_labels == 1: @@ -880,7 +886,7 @@ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.config = config self.num_labels = config.num_labels - self.levit = TFLeViTMainLayer(config, name="levit") + self.levit = TFLevitMainLayer(config, name="levit") # Classifier head self.classifier = ( @@ -908,8 +914,14 @@ def call( pixel_values: tf.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + training: Optional[bool] = None, ): - outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.levit( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) sequence_output = outputs[0] sequence_output = tf.math.reduce_mean(sequence_output, axis=1) From 793385dca8fc0a99f4890e019941f63671285b22 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sat, 8 Oct 2022 14:33:16 +0530 Subject: [PATCH 04/11] chore: adding non trainable variables and training flag --- .../models/levit/modeling_tf_levit.py | 196 +++++++++++------- 1 file changed, 118 insertions(+), 78 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 679f804e5d42..b4277c069c62 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -17,6 +17,7 @@ import itertools from dataclasses import dataclass from typing import Optional, Tuple, Dict +from numpy import indices import tensorflow as tf from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy, CategoricalCrossentropy @@ -87,16 +88,7 @@ class TFLevitConvEmbeddings(tf.keras.layers.Layer): """ def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation=1, - groups=1, - bn_weight_init=1, - **kwargs, + self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1, **kwargs, ): super().__init__(**kwargs) self.convolution = tf.keras.layers.Conv2D( @@ -107,15 +99,15 @@ def __init__( dilation_rate=dilation, groups=groups, bias=False, - data_format="channels_first", # required for tf + data_format="channels_first", name="convolution", ) # The epsilon and momentum used here are the defaults in torch batch norm layer. 
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, embeddings): - embeddings = self.convolution(embeddings) - embeddings = self.batch_norm(embeddings) + def call(self, embeddings, training=None): + embeddings = self.convolution(embeddings, training=training) + embeddings = self.batch_norm(embeddings, training=training) return embeddings @@ -133,59 +125,61 @@ class TFLevitPatchEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.embedding_layer_1 = TFLevitConvEmbeddings( - config.num_channels, - config.hidden_sizes[0] // 8, - config.kernel_size, - config.stride, - config.padding, + in_channels=config.num_channels, + out_channels=config.hidden_sizes[0] // 8, + kernel_size=config.kernel_size, + stride=config.stride, + padding=config.padding, name="embedding_layer_1", ) self.activation_layer_1 = hard_swish self.embedding_layer_2 = TFLevitConvEmbeddings( - config.hidden_sizes[0] // 8, - config.hidden_sizes[0] // 4, - config.kernel_size, - config.stride, - config.padding, + in_channels=config.hidden_sizes[0] // 8, + out_channels=config.hidden_sizes[0] // 4, + kernel_size=config.kernel_size, + stride=config.stride, + padding=config.padding, name="embedding_layer_2", ) self.activation_layer_2 = hard_swish self.embedding_layer_3 = TFLevitConvEmbeddings( - config.hidden_sizes[0] // 4, - config.hidden_sizes[0] // 2, - config.kernel_size, - config.stride, - config.padding, + in_channels=config.hidden_sizes[0] // 4, + out_channels=config.hidden_sizes[0] // 2, + kernel_size=config.kernel_size, + stride=config.stride, + padding=config.padding, name="embedding_layer_3", ) self.activation_layer_3 = hard_swish self.embedding_layer_4 = TFLevitConvEmbeddings( - config.hidden_sizes[0] // 2, - config.hidden_sizes[0], - config.kernel_size, - config.stride, - config.padding, + in_channels=config.hidden_sizes[0] // 2, + out_channels=config.hidden_sizes[0], + kernel_size=config.kernel_size, + stride=config.stride, + padding=config.padding, name="embedding_layer_4", ) self.num_channels = config.num_channels - def call(self, pixel_values): + def call(self, pixel_values, training=None): batch_size = tf.shape(pixel_values)[0] num_channels = tf.shape(pixel_values)[1] + if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) - embeddings = self.embedding_layer_1(pixel_values) + + embeddings = self.embedding_layer_1(pixel_values, training=training) embeddings = self.activation_layer_1(embeddings) - embeddings = self.embedding_layer_2(embeddings) + embeddings = self.embedding_layer_2(embeddings, training=training) embeddings = self.activation_layer_2(embeddings) - embeddings = self.embedding_layer_3(embeddings) + embeddings = self.embedding_layer_3(embeddings, training=training) embeddings = self.activation_layer_3(embeddings) - embeddings = self.embedding_layer_4(embeddings) + embeddings = self.embedding_layer_4(embeddings, training=training) # Flatten the embeddings flattended_embeddings = tf.reshape(embeddings, shape=(batch_size, num_channels, -1)) # Transpose the channel and spatial axis of the flattened embeddings @@ -200,19 +194,24 @@ def __init__(self, input_dim, output_dim, bn_weight_init=1, **kwargs): # The epsilon and momentum used here are the defaults in torch batch norm layer. 
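# Sketch of the flatten-and-transpose step in the patch embeddings above: a
# (batch, channels, height, width) feature map becomes a
# (batch, height * width, channels) token sequence (sizes illustrative):
import tensorflow as tf

feature_map = tf.random.normal((2, 8, 4, 4))          # (batch, channels, height, width)
flattened = tf.reshape(feature_map, (2, 8, -1))       # (2, 8, 16)
tokens = tf.transpose(flattened, perm=(0, 2, 1))      # (2, 16, 8)
assert tokens.shape == (2, 16, 8)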
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, hidden_state): + def call(self, hidden_state, training=None): num_channels = tf.shape(hidden_state)[2] - hidden_state = self.linear(hidden_state) + hidden_state = self.linear(hidden_state, training=training) + # Before sending the hidden state to the batch normalization layer, we would have to # flatten the hidden states in the batch and seq len dimension flattened_hidden_state = tf.reshape(hidden_state, shape=(-1, num_channels)) - batch_norm_hidden_state = self.batch_norm(flattened_hidden_state) + batch_norm_hidden_state = self.batch_norm(flattened_hidden_state, training=training) + # Reshape the output of batch norm to have the same shape as the original hidden state hidden_state = tf.reshape(batch_norm_hidden_state, shape=tf.shape(hidden_state)) return hidden_state class TFLevitSubsample(tf.keras.layers.Layer): + """ + Layer to subsample the activatioin maps + """ def __init__(self, stride, resolution, **kwargs): super().__init__() self.stride = stride @@ -221,11 +220,13 @@ def __init__(self, stride, resolution, **kwargs): def call(self, hidden_state): batch_size = tf.shape(hidden_state)[0] channels = tf.shape(hidden_state)[2] + reshaped_hidden_state = tf.reshape( hidden_state, shape=(batch_size, self.resolution, self.resolution, channels) ) strided_hidden_state = reshaped_hidden_state[:, :: self.stride, :: self.stride] hidden_state = tf.reshape(strided_hidden_state, shape=(batch_size, -1, channels)) + return hidden_state @@ -243,15 +244,23 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, self.activation = hard_swish self.projection = TFMLPLayerWithBN(self.out_dim_projection, hidden_sizes, bn_weight_init=0, name="projection") + # Build tuples of points in the entire resolution range of the pixel values points = list(itertools.product(range(resolution), range(resolution))) - len_points = len(points) + self.len_points = len(points) + + # Initialize the attention offsets and indices attention_offsets, indices = {}, [] - for p1 in points: - for p2 in points: + + # Iterate over the points generator and calculate the offset between the initial + # point (0, 0) and the rest of the points [(0, 1), (0, 2)...] 
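# Sketch of the subsampling above: the token sequence is viewed as its 2D grid,
# every `stride`-th row and column is kept, and the result is flattened back;
# a 4x4 grid with stride 2 keeps 4 of the 16 tokens:
import tensorflow as tf

tokens = tf.random.normal((2, 16, 3))                 # (batch, 4 * 4, channels)
grid = tf.reshape(tokens, (2, 4, 4, 3))
strided = grid[:, ::2, ::2]                           # (2, 2, 2, 3)
subsampled = tf.reshape(strided, (2, -1, 3))          # (2, 4, 3)
assert subsampled.shape == (2, 4, 3)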
+ for p1 in points: # this iterates only once + for p2 in points: # iterate over all the points other than (0, 0) offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) if offset not in attention_offsets: attention_offsets[offset] = len(attention_offsets) indices.append(attention_offsets[offset]) + + # Store the attention offsets, indices and attention bias cache self.attention_offsets = attention_offsets self.indices = indices self.attention_bias_cache = {} @@ -263,6 +272,12 @@ def build(self, input_shape): trainable=True, name="attention_biases", ) + self.attention_bias_idxs = tf.Variable( + initial_value=tf.reshape(self.indices, (self.len_points, self.len_points)), + trainable=False, # this is a registered buffer and not a parameter + dtype=tf.float32, + name="attention_bias_idxs", + ) super().build(input_shape) # TODO @ariG23498 @@ -272,16 +287,16 @@ def train(self, mode=True): if mode and self.attention_bias_cache: self.attention_bias_cache = {} # clear ab cache - def get_attention_biases(self, device, attention_bias_idxs, training=None): + def get_attention_biases(self, device, training=None): if training: - return self.attention_biases[:, attention_bias_idxs] + return self.attention_biases[:, self.attention_bias_idxs] else: device_key = str(device) if device_key not in self.attention_bias_cache: self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] return self.attention_bias_cache[device_key] - def call(self, hidden_state, attention_bias_idxs, training=None): + def call(self, hidden_state, training=None): batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] queries_keys_values = self.queries_keys_values(hidden_state) @@ -300,7 +315,7 @@ def call(self, hidden_state, attention_bias_idxs, training=None): value = tf.transpose(value, perm=(0, 2, 1, 3)) attention = tf.matmul(query, key, transpose_b=True) * self.scale + self.get_attention_biases( - hidden_state.device, attention_bias_idxs, training=training + hidden_state.device, training=training ) attention = stable_softmax(attention, axis=-1) hidden_state = tf.matmul(attention, value) @@ -342,7 +357,7 @@ def __init__( points = list(itertools.product(range(resolution_in), range(resolution_in))) points_ = list(itertools.product(range(resolution_out), range(resolution_out))) - len_points, len_points_ = len(points), len(points_) + self.len_points, self.len_points_ = len(points), len(points_) attention_offsets, indices = {}, [] for p1 in points_: for p2 in points: @@ -353,6 +368,7 @@ def __init__( indices.append(attention_offsets[offset]) self.attention_offsets = attention_offsets + self.indices = indices def build(self, input_shape): self.attention_biases = self.add_weight( @@ -361,6 +377,13 @@ def build(self, input_shape): trainable=True, name="attention_biases", ) + + self.attention_bias_idxs = tf.Variable( + initial_value=tf.reshape(self.indices, (self.len_points_, self.len_points)), + trainable=False, + dtype=tf.float32, + name="attention_bias_idxs", + ) super().build(input_shape) # TODO @ariG23498 @@ -370,23 +393,22 @@ def train(self, mode=True): if mode and self.attention_bias_cache: self.attention_bias_cache = {} # clear ab cache - def get_attention_biases(self, device, attention_bias_idxs, training=None): + def get_attention_biases(self, device, training=None): if training: - return self.attention_biases[:, attention_bias_idxs] + return self.attention_biases[:, self.attention_bias_idxs] else: device_key = str(device) if device_key not in self.attention_bias_cache: 
self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] return self.attention_bias_cache[device_key] - def call(self, hidden_state, attention_bias_idxs, training=None): + def call(self, hidden_state, training=None): batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] - + # Process the hidden states and reshape it reshaped_hidden_state = tf.reshape( - self.keys_values(hidden_state), - shape=(batch_size, seq_length, self.num_attention_heads, -1) + self.keys_values(hidden_state), shape=(batch_size, seq_length, self.num_attention_heads, -1) ) # Split the reshaped hidden state into key and value key, value = tf.split( @@ -402,11 +424,13 @@ def call(self, hidden_state, attention_bias_idxs, training=None): query = tf.transpose(query, perm=(0, 2, 1, 3)) attention = tf.matmul(query, key, transpose_b=True) * self.scale + self.get_attention_biases( - hidden_state.device, attention_bias_idxs, training=training + hidden_state.device, training=training ) - attention = attention.softmax(dim=-1) - hidden_state = (attention @ value).transpose(1, 2).reshape(batch_size, -1, self.out_dim_projection) - hidden_state = self.projection(self.activation(hidden_state)) + attention = stable_softmax(attention, axis=-1) + hidden_state = tf.matmul(attention, value) + hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3)) + hidden_state = tf.reshape(hidden_state, (batch_size, -1, self.out_dim_projection)) + hidden_state = self.projection(self.activation(hidden_state), training=training) return hidden_state @@ -474,7 +498,7 @@ def __init__( self.config = config self.resolution_in = resolution_in # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - + for idx in range(depths): self.layers.append( TFLevitResidualLayer( @@ -552,7 +576,7 @@ def __init__(self, config, **kwargs): config.mlp_ratio[stage_idx], config.down_ops[stage_idx], resolution, - name=f"stages.{stage_idx}" + name=f"stages.{stage_idx}", ) resolution = stage.get_resolution() self.stages.append(stage) @@ -583,11 +607,11 @@ def __init__(self, input_dim, output_dim): # The epsilon and momentum used here are the defaults in torch batch norm layer. 
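# Shape walk-through of the attention arithmetic above, with `tf.nn.softmax`
# standing in for the library's `stable_softmax` (sizes illustrative):
import tensorflow as tf

batch, heads, seq, key_dim = 2, 4, 16, 8
query = tf.random.normal((batch, heads, seq, key_dim))
key = tf.random.normal((batch, heads, seq, key_dim))
value = tf.random.normal((batch, heads, seq, key_dim))

scores = tf.matmul(query, key, transpose_b=True) * key_dim**-0.5   # (2, 4, 16, 16)
weights = tf.nn.softmax(scores, axis=-1)
context = tf.matmul(weights, value)                                # (2, 4, 16, 8)
context = tf.transpose(context, perm=(0, 2, 1, 3))                 # (2, 16, 4, 8)
context = tf.reshape(context, (batch, seq, heads * key_dim))       # (2, 16, 32)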
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - self.linear = tf.keras.layers.Dense(units=output_dim, name="linear") + self.linear = tf.keras.layers.Dense(units=output_dim, bias=False, name="linear") def call(self, hidden_state, training=None): - hidden_state = self.batch_norm(hidden_state) - logits = self.linear(hidden_state) + hidden_state = self.batch_norm(hidden_state, training=training) + logits = self.linear(hidden_state, training=training) return logits @@ -610,7 +634,10 @@ def call( if pixel_values is None: raise ValueError("You have to specify pixel_values") + # Apply patch embeddings to the pixel values embeddings = self.patch_embeddings(pixel_values, training=training) + + # Apply encoder to the encoded pixel values encoder_outputs = self.encoder( embeddings, output_hidden_states=output_hidden_states, @@ -618,7 +645,8 @@ def call( training=training, ) - last_hidden_state = encoder_outputs[0] + # Obtain the `last_hidden_state` + last_hidden_state = encoder_outputs[0] # encoder_outputs.last_hidden_state # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1) @@ -629,9 +657,10 @@ def call( return TFBaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, + hidden_states=encoder_outputs.hidden_states, # only if the `output_hidden_states` is set to True ) + class TFLevitPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -733,7 +762,6 @@ def serving(self, inputs): """ - @add_start_docstrings( "The bare Levit model outputting raw features without any specific head on top.", LEVIT_START_DOCSTRING, @@ -741,7 +769,7 @@ def serving(self, inputs): class TFLevitModel(TFLevitPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) - + self.levit = TFLevitMainLayer(config=config, name="levit") @unpack_inputs @@ -769,7 +797,7 @@ def call( ) return outputs - + # TODO @ariG23498: Check the output type for serving. def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None @@ -783,7 +811,6 @@ def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> ) - @add_start_docstrings( """ Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for @@ -828,6 +855,7 @@ def call( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
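# The pooled output above is a plain mean over the token axis, replacing
# torch's `sequence_output.mean(1)`:
import tensorflow as tf

last_hidden_state = tf.random.normal((2, 16, 384))         # (batch, seq, hidden)
pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1)
assert pooled_output.shape == (2, 384)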
""" + # Get the outputs from the levit main layer outputs = self.levit( pixel_values=pixel_values, output_hidden_states=output_hidden_states, @@ -835,15 +863,19 @@ def call( training=training, ) - sequence_output = outputs[0] + # Get the `last_hidden_state` and average it along the number of sequences + sequence_output = outputs[0] # outputs.last_hidden_state sequence_output = tf.math.reduce_mean(sequence_output, axis=1) - logits = self.classifier(sequence_output) + + # Apply the classifier head and obtain the logits + logits = self.classifier(sequence_output, training=training) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" + # TODO @ariG23498: Check with the dtypes (long and int in torch) elif self.num_labels > 1 and (labels.dtype == tf.float64 or labels.dtype == tf.int64): self.config.problem_type = "single_label_classification" else: @@ -868,7 +900,7 @@ def call( return TFImageClassifierOutputWithNoAttention( loss=loss, logits=logits, - hidden_states=outputs.hidden_states, + hidden_states=outputs.hidden_states, # only if `output_hidden_states` flag is set to True ) @@ -899,7 +931,7 @@ def __init__(self, config, **kwargs): if config.num_labels > 0 else tf.identity ) - + @unpack_inputs @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) @add_code_sample_docstrings( @@ -916,6 +948,7 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = None, ): + # Get the output from the levit main layer outputs = self.levit( pixel_values=pixel_values, output_hidden_states=output_hidden_states, @@ -923,9 +956,16 @@ def call( training=training, ) - sequence_output = outputs[0] + # Get the `last_hidden_state` and average it along the number of sequences + sequence_output = outputs[0] # outputs.last_hidden_state sequence_output = tf.math.reduce_mean(sequence_output, axis=1) - cls_logits, distill_logits = self.classifier(sequence_output), self.classifier_distill(sequence_output) + + # Apply the classifier heads and obtain the `cls_logits` and `distill_logits` + cls_logits, distill_logits = self.classifier(sequence_output, training=training), self.classifier_distill( + sequence_output, training=training + ) + + # According to the paper, the cls and distill logits are averaged logits = (cls_logits + distill_logits) / 2 if not return_dict: @@ -936,5 +976,5 @@ def call( logits=logits, cls_logits=cls_logits, distillation_logits=distill_logits, - hidden_states=outputs.hidden_states, + hidden_states=outputs.hidden_states, # only if `output_hidden_states` flag is set to True ) From 7982dea98140035755eef15bc999aecca3ff2a99 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sat, 8 Oct 2022 15:05:29 +0530 Subject: [PATCH 05/11] chore: adapting till TFLevitStage --- .../models/levit/modeling_tf_levit.py | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index b4277c069c62..200114dfac5b 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -95,10 +95,10 @@ def __init__( filters=out_channels, kernel_size=kernel_size, strides=stride, - padding=padding, + padding=(padding, padding), # TODO @ariG23498: Make sure the padding is a tuple dilation_rate=dilation, groups=groups, - bias=False, + use_bias=False, data_format="channels_first", name="convolution", ) @@ -190,7 +190,7 @@ def call(self, 
pixel_values, training=None): class TFMLPLayerWithBN(tf.keras.layers.Layer): def __init__(self, input_dim, output_dim, bn_weight_init=1, **kwargs): super().__init__(**kwargs) - self.linear = tf.keras.layers.Dense(units=output_dim, bias=False, name="linear") + self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") # The epsilon and momentum used here are the defaults in torch batch norm layer. self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") @@ -280,12 +280,12 @@ def build(self, input_shape): ) super().build(input_shape) - # TODO @ariG23498 - @torch.no_grad() - def train(self, mode=True): - super().train(mode) - if mode and self.attention_bias_cache: - self.attention_bias_cache = {} # clear ab cache + # # TODO @ariG23498 + # @torch.no_grad() + # def train(self, mode=True): + # super().train(mode) + # if mode and self.attention_bias_cache: + # self.attention_bias_cache = {} # clear ab cache def get_attention_biases(self, device, training=None): if training: @@ -386,12 +386,12 @@ def build(self, input_shape): ) super().build(input_shape) - # TODO @ariG23498 - @torch.no_grad() - def train(self, mode=True): - super().train(mode) - if mode and self.attention_bias_cache: - self.attention_bias_cache = {} # clear ab cache + # # TODO @ariG23498 + # @torch.no_grad() + # def train(self, mode=True): + # super().train(mode) + # if mode and self.attention_bias_cache: + # self.attention_bias_cache = {} # clear ab cache def get_attention_biases(self, device, training=None): if training: @@ -445,10 +445,10 @@ def __init__(self, input_dim, hidden_dim, **kwargs): self.activation = hard_swish self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim, name="linear_down") - def call(self, hidden_state): - hidden_state = self.linear_up(hidden_state) + def call(self, hidden_state, training=None): + hidden_state = self.linear_up(hidden_state, training=training) hidden_state = self.activation(hidden_state) - hidden_state = self.linear_down(hidden_state) + hidden_state = self.linear_down(hidden_state, training=training) return hidden_state @@ -518,10 +518,14 @@ def __init__( ) if down_ops[0] == "Subsample": + + print("info", self.config.hidden_sizes) + print("info", idx) self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 self.layers.append( TFLevitAttentionSubsample( - *self.config.hidden_sizes[idx : idx + 2], + input_dim=self.config.hidden_sizes[idx], + output_dim=self.config.hidden_sizes[idx + 1], key_dim=down_ops[1], num_attention_heads=down_ops[2], attention_ratio=down_ops[3], @@ -607,7 +611,7 @@ def __init__(self, input_dim, output_dim): # The epsilon and momentum used here are the defaults in torch batch norm layer. 
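# Why `use_bias=False` in the layers above: each one feeds straight into a
# BatchNormalization layer, whose learned offset (beta) makes a preceding bias
# redundant; a minimal sketch of the pairing:
import tensorflow as tf

linear = tf.keras.layers.Dense(units=32, use_bias=False)
batch_norm = tf.keras.layers.BatchNormalization()
outputs = batch_norm(linear(tf.random.normal((4, 16))), training=True)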
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - self.linear = tf.keras.layers.Dense(units=output_dim, bias=False, name="linear") + self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") def call(self, hidden_state, training=None): hidden_state = self.batch_norm(hidden_state, training=training) @@ -617,8 +621,10 @@ def call(self, hidden_state, training=None): @keras_serializable class TFLevitMainLayer(tf.keras.layers.Layer): + config_class = LevitConfig + def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) + super().__init__(**kwargs) self.config = config self.patch_embeddings = TFLevitPatchEmbeddings(config, name="patch_embeddings") self.encoder = TFLevitEncoder(config, name="encoder") From 57f5f74dc3e7036a1137e92ca6afaf4b330a85eb Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sun, 9 Oct 2022 12:02:03 +0530 Subject: [PATCH 06/11] chore: aligning till attention biases --- .../models/levit/modeling_tf_levit.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 200114dfac5b..77820d7388ea 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -16,12 +16,11 @@ import itertools from dataclasses import dataclass -from typing import Optional, Tuple, Dict -from numpy import indices +from typing import Dict, Optional, Tuple import tensorflow as tf -from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy, CategoricalCrossentropy from tensorflow.keras import backend as K +from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, MeanSquaredError from ...modeling_outputs import ModelOutput from ...modeling_tf_outputs import ( @@ -59,7 +58,7 @@ @dataclass class TFLevitForImageClassificationWithTeacherOutput(ModelOutput): """ - Output type of [`LevitForImageClassificationWithTeacher`]. + Output type of [`TFLevitForImageClassificationWithTeacher`]. Args: logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): @@ -95,18 +94,20 @@ def __init__( filters=out_channels, kernel_size=kernel_size, strides=stride, - padding=(padding, padding), # TODO @ariG23498: Make sure the padding is a tuple + padding="SAME", # TODO @ariG23498: Make sure the padding is a tuple dilation_rate=dilation, groups=groups, use_bias=False, - data_format="channels_first", + data_format="channels_last", name="convolution", ) # The epsilon and momentum used here are the defaults in torch batch norm layer. 
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") def call(self, embeddings, training=None): + embeddings = tf.transpose(embeddings, perm=(0, 2, 3, 1)) embeddings = self.convolution(embeddings, training=training) + embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) embeddings = self.batch_norm(embeddings, training=training) return embeddings @@ -181,6 +182,7 @@ def call(self, pixel_values, training=None): embeddings = self.activation_layer_3(embeddings) embeddings = self.embedding_layer_4(embeddings, training=training) # Flatten the embeddings + num_channels = tf.shape(embeddings)[1] flattended_embeddings = tf.reshape(embeddings, shape=(batch_size, num_channels, -1)) # Transpose the channel and spatial axis of the flattened embeddings transpose_embeddings = tf.transpose(flattended_embeddings, perm=(0, 2, 1)) @@ -275,7 +277,7 @@ def build(self, input_shape): self.attention_bias_idxs = tf.Variable( initial_value=tf.reshape(self.indices, (self.len_points, self.len_points)), trainable=False, # this is a registered buffer and not a parameter - dtype=tf.float32, + dtype=tf.int32, name="attention_bias_idxs", ) super().build(input_shape) @@ -293,6 +295,8 @@ def get_attention_biases(self, device, training=None): else: device_key = str(device) if device_key not in self.attention_bias_cache: + print("INFO biases cache", self.attention_biases.shape) + print("INFO biases index", self.attention_bias_idxs.shape) self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] return self.attention_bias_cache[device_key] @@ -381,7 +385,7 @@ def build(self, input_shape): self.attention_bias_idxs = tf.Variable( initial_value=tf.reshape(self.indices, (self.len_points_, self.len_points)), trainable=False, - dtype=tf.float32, + dtype=tf.int32, name="attention_bias_idxs", ) super().build(input_shape) @@ -498,13 +502,12 @@ def __init__( self.config = config self.resolution_in = resolution_in # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - - for idx in range(depths): + for index in range(depths): self.layers.append( TFLevitResidualLayer( TFLevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), self.config.drop_path_rate, - name=f"layers.{idx}", + name=f"layers.{index}", ) ) if mlp_ratio > 0: @@ -513,19 +516,15 @@ def __init__( TFLevitResidualLayer( TFLevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate, - name=f"layers.{idx}", + name=f"layers.{index}", ) ) if down_ops[0] == "Subsample": - - print("info", self.config.hidden_sizes) - print("info", idx) self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 self.layers.append( TFLevitAttentionSubsample( - input_dim=self.config.hidden_sizes[idx], - output_dim=self.config.hidden_sizes[idx + 1], + *self.config.hidden_sizes[idx : idx + 2], key_dim=down_ops[1], num_attention_heads=down_ops[2], attention_ratio=down_ops[3], From fc816813f1a200da519dc9f2dab077a7c8317494 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Mon, 21 Nov 2022 21:19:07 +0530 Subject: [PATCH 07/11] chore: adding padding before conv in TFLevitConvEmbeddings --- src/transformers/models/levit/modeling_tf_levit.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 77820d7388ea..9abf8aebd7df 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ 
b/src/transformers/models/levit/modeling_tf_levit.py @@ -87,14 +87,15 @@ class TFLevitConvEmbeddings(tf.keras.layers.Layer): """ def __init__( - self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1, **kwargs, + self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1, *args, **kwargs, ): - super().__init__(**kwargs) + super().__init__(*args, **kwargs) + # The padding layer is built in order to pad the inputs before entering the convolution operation. + self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) self.convolution = tf.keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, strides=stride, - padding="SAME", # TODO @ariG23498: Make sure the padding is a tuple dilation_rate=dilation, groups=groups, use_bias=False, @@ -104,8 +105,9 @@ def __init__( # The epsilon and momentum used here are the defaults in torch batch norm layer. self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, embeddings, training=None): + def call(self, embeddings: tf.Tensor, training: Optional[bool]=None): embeddings = tf.transpose(embeddings, perm=(0, 2, 3, 1)) + embeddings = self.padding(embeddings) embeddings = self.convolution(embeddings, training=training) embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) embeddings = self.batch_norm(embeddings, training=training) From 876294a6dc59c6c85d3fbb6317defe1f74590a2a Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Mon, 21 Nov 2022 22:09:28 +0530 Subject: [PATCH 08/11] chore: modification to the reshape operation in TFMLPLayerWithBN --- .../models/levit/modeling_tf_levit.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 9abf8aebd7df..f2f12cf4dbb6 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -125,8 +125,8 @@ class TFLevitPatchEmbeddings(tf.keras.layers.Layer): `TFLevitConvEmbeddings`. 
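# Sketch of the padding change above: Keras `Conv2D` accepts only "valid" or
# "same", not torch's integer `padding`, so an explicit `ZeroPadding2D` before
# a "valid" convolution reproduces `Conv2d(..., padding=p)` spatial arithmetic:
import tensorflow as tf

pad = tf.keras.layers.ZeroPadding2D(padding=1)
conv = tf.keras.layers.Conv2D(filters=8, kernel_size=3, strides=2, use_bias=False)
x = tf.random.normal((1, 16, 16, 3))        # channels-last input
y = conv(pad(x))                            # 16x16 -> 8x8, as in torch
assert y.shape == (1, 8, 8, 8)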
""" - def __init__(self, config, **kwargs): - super().__init__(**kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(*args, **kwargs) self.embedding_layer_1 = TFLevitConvEmbeddings( in_channels=config.num_channels, out_channels=config.hidden_sizes[0] // 8, @@ -167,7 +167,7 @@ def __init__(self, config, **kwargs): ) self.num_channels = config.num_channels - def call(self, pixel_values, training=None): + def call(self, pixel_values: tf.Tensor, training: Optional[bool]=None): batch_size = tf.shape(pixel_values)[0] num_channels = tf.shape(pixel_values)[1] @@ -183,6 +183,7 @@ def call(self, pixel_values, training=None): embeddings = self.embedding_layer_3(embeddings, training=training) embeddings = self.activation_layer_3(embeddings) embeddings = self.embedding_layer_4(embeddings, training=training) + # Flatten the embeddings num_channels = tf.shape(embeddings)[1] flattended_embeddings = tf.reshape(embeddings, shape=(batch_size, num_channels, -1)) @@ -192,23 +193,28 @@ def call(self, pixel_values, training=None): class TFMLPLayerWithBN(tf.keras.layers.Layer): - def __init__(self, input_dim, output_dim, bn_weight_init=1, **kwargs): - super().__init__(**kwargs) - self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") + def __init__(self, input_dim, output_dim, bn_weight_init=1, *args, **kwargs): + super().__init__(*args, **kwargs) + self.linear = tf.keras.layers.Dense( + units=output_dim, + use_bias=False, + name="linear" + ) # The epsilon and momentum used here are the defaults in torch batch norm layer. self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, hidden_state, training=None): - num_channels = tf.shape(hidden_state)[2] + def call(self, hidden_state: tf.Tensor, training: Optional[bool]=None): hidden_state = self.linear(hidden_state, training=training) # Before sending the hidden state to the batch normalization layer, we would have to - # flatten the hidden states in the batch and seq len dimension - flattened_hidden_state = tf.reshape(hidden_state, shape=(-1, num_channels)) + # flatten the hidden states with start=0 and end=1. 
+ hidden_state_shape_list = shape_list(hidden_state) + hidden_state_reshape_list = [hidden_state_shape_list[0] * hidden_state_shape_list[1]] + hidden_state_shape_list[2:] + flattened_hidden_state = tf.reshape(hidden_state, shape=hidden_state_reshape_list) batch_norm_hidden_state = self.batch_norm(flattened_hidden_state, training=training) # Reshape the output of batch norm to have the same shape as the original hidden state - hidden_state = tf.reshape(batch_norm_hidden_state, shape=tf.shape(hidden_state)) + hidden_state = tf.reshape(batch_norm_hidden_state, shape=shape_list(hidden_state)) return hidden_state From 8bbc04743d731d74576fd98ec79bc7c7c38b4e5f Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Wed, 23 Nov 2022 13:30:56 +0530 Subject: [PATCH 09/11] chore: all the variables of LeViT model are ported in TF --- .../models/levit/modeling_tf_levit.py | 380 ++++++++++-------- 1 file changed, 213 insertions(+), 167 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index f2f12cf4dbb6..4c8af385b2e5 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -16,7 +16,7 @@ import itertools from dataclasses import dataclass -from typing import Dict, Optional, Tuple +from typing import Dict, Optional, Tuple, Union import tensorflow as tf from tensorflow.keras import backend as K @@ -70,9 +70,9 @@ class token). Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the distillation token). hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer - plus the initial embedding outputs. + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. """ logits: tf.Tensor = None @@ -87,7 +87,17 @@ class TFLevitConvEmbeddings(tf.keras.layers.Layer): """ def __init__( - self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1, *args, **kwargs, + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bn_weight_init=1, + *args, + **kwargs, ): super().__init__(*args, **kwargs) # The padding layer is built in order to pad the inputs before entering the convolution operation. @@ -105,12 +115,12 @@ def __init__( # The epsilon and momentum used here are the defaults in torch batch norm layer. 
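# Sketch of the reshape introduced above: batch and sequence axes are merged so
# BatchNormalization sees a 2D (rows, channels) tensor, then the original shape
# is restored; `.shape.as_list()` stands in here for the library's `shape_list`:
import tensorflow as tf

hidden_state = tf.random.normal((2, 16, 8))
shape = hidden_state.shape.as_list()
flattened = tf.reshape(hidden_state, [shape[0] * shape[1]] + shape[2:])   # (32, 8)
batch_norm = tf.keras.layers.BatchNormalization()
restored = tf.reshape(batch_norm(flattened, training=True), shape)
assert restored.shape == (2, 16, 8)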
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, embeddings: tf.Tensor, training: Optional[bool]=None): + def call(self, embeddings: tf.Tensor, training: Optional[bool] = None): embeddings = tf.transpose(embeddings, perm=(0, 2, 3, 1)) embeddings = self.padding(embeddings) embeddings = self.convolution(embeddings, training=training) - embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) embeddings = self.batch_norm(embeddings, training=training) + embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) return embeddings @@ -167,15 +177,15 @@ def __init__(self, config, *args, **kwargs): ) self.num_channels = config.num_channels - def call(self, pixel_values: tf.Tensor, training: Optional[bool]=None): + def call(self, pixel_values: tf.Tensor, training: Optional[bool] = None): batch_size = tf.shape(pixel_values)[0] num_channels = tf.shape(pixel_values)[1] - + if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) - + embeddings = self.embedding_layer_1(pixel_values, training=training) embeddings = self.activation_layer_1(embeddings) embeddings = self.embedding_layer_2(embeddings, training=training) @@ -183,7 +193,7 @@ def call(self, pixel_values: tf.Tensor, training: Optional[bool]=None): embeddings = self.embedding_layer_3(embeddings, training=training) embeddings = self.activation_layer_3(embeddings) embeddings = self.embedding_layer_4(embeddings, training=training) - + # Flatten the embeddings num_channels = tf.shape(embeddings)[1] flattended_embeddings = tf.reshape(embeddings, shape=(batch_size, num_channels, -1)) @@ -195,24 +205,22 @@ def call(self, pixel_values: tf.Tensor, training: Optional[bool]=None): class TFMLPLayerWithBN(tf.keras.layers.Layer): def __init__(self, input_dim, output_dim, bn_weight_init=1, *args, **kwargs): super().__init__(*args, **kwargs) - self.linear = tf.keras.layers.Dense( - units=output_dim, - use_bias=False, - name="linear" - ) + self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") # The epsilon and momentum used here are the defaults in torch batch norm layer. self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, hidden_state: tf.Tensor, training: Optional[bool]=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state = self.linear(hidden_state, training=training) - + # Before sending the hidden state to the batch normalization layer, we would have to # flatten the hidden states with start=0 and end=1. 
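# Sketch of the layout juggling above: the port keeps tensors channels-first at
# module boundaries (matching the torch reference), while Conv2D and
# BatchNormalization run channels-last, so inputs are transposed in and back out:
import tensorflow as tf

x_nchw = tf.random.normal((1, 3, 16, 16))
x_nhwc = tf.transpose(x_nchw, perm=(0, 2, 3, 1))    # NCHW -> NHWC
y_nhwc = tf.keras.layers.Conv2D(filters=8, kernel_size=3, padding="same")(x_nhwc)
y_nchw = tf.transpose(y_nhwc, perm=(0, 3, 1, 2))    # NHWC -> NCHW
assert y_nchw.shape == (1, 8, 16, 16)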
hidden_state_shape_list = shape_list(hidden_state) - hidden_state_reshape_list = [hidden_state_shape_list[0] * hidden_state_shape_list[1]] + hidden_state_shape_list[2:] + hidden_state_reshape_list = [ + hidden_state_shape_list[0] * hidden_state_shape_list[1] + ] + hidden_state_shape_list[2:] flattened_hidden_state = tf.reshape(hidden_state, shape=hidden_state_reshape_list) batch_norm_hidden_state = self.batch_norm(flattened_hidden_state, training=training) - + # Reshape the output of batch norm to have the same shape as the original hidden state hidden_state = tf.reshape(batch_norm_hidden_state, shape=shape_list(hidden_state)) return hidden_state @@ -222,27 +230,28 @@ class TFLevitSubsample(tf.keras.layers.Layer): """ Layer to subsample the activatioin maps """ - def __init__(self, stride, resolution, **kwargs): - super().__init__() + + def __init__(self, stride, resolution, *args, **kwargs): + super().__init__(*args, **kwargs) self.stride = stride self.resolution = resolution - def call(self, hidden_state): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): batch_size = tf.shape(hidden_state)[0] channels = tf.shape(hidden_state)[2] - + reshaped_hidden_state = tf.reshape( hidden_state, shape=(batch_size, self.resolution, self.resolution, channels) ) strided_hidden_state = reshaped_hidden_state[:, :: self.stride, :: self.stride] hidden_state = tf.reshape(strided_hidden_state, shape=(batch_size, -1, channels)) - + return hidden_state class TFLevitAttention(tf.keras.layers.Layer): - def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution, **kwargs): - super().__init__(**kwargs) + def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution, *args, **kwargs): + super().__init__(*args, **kwargs) self.num_attention_heads = num_attention_heads self.scale = key_dim**-0.5 self.key_dim = key_dim @@ -250,9 +259,13 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads * 2 self.out_dim_projection = attention_ratio * key_dim * num_attention_heads - self.queries_keys_values = TFMLPLayerWithBN(hidden_sizes, self.out_dim_keys_values, name="queries_keys_values") + self.queries_keys_values = TFMLPLayerWithBN( + input_dim=hidden_sizes, output_dim=self.out_dim_keys_values, name="queries_keys_values" + ) self.activation = hard_swish - self.projection = TFMLPLayerWithBN(self.out_dim_projection, hidden_sizes, bn_weight_init=0, name="projection") + self.projection = TFMLPLayerWithBN( + input_dim=self.out_dim_projection, output_dim=hidden_sizes, bn_weight_init=0, name="projection" + ) # Build tuples of points in the entire resolution range of the pixel values points = list(itertools.product(range(resolution), range(resolution))) @@ -261,21 +274,21 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, # Initialize the attention offsets and indices attention_offsets, indices = {}, [] - # Iterate over the points generator and calculate the offset between the initial + # Iterate over the `points`` generator and calculate the offset between the initial # point (0, 0) and the rest of the points [(0, 1), (0, 2)...] 
- for p1 in points: # this iterates only once - for p2 in points: # iterate over all the points other than (0, 0) + for p1 in points: # this iterates only once, wehre p1 is (0, 0) + for p2 in points: # iterate over all the points other than (0, 0) offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) if offset not in attention_offsets: attention_offsets[offset] = len(attention_offsets) indices.append(attention_offsets[offset]) - + # Store the attention offsets, indices and attention bias cache self.attention_offsets = attention_offsets self.indices = indices self.attention_bias_cache = {} - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.attention_biases = self.add_weight( shape=(self.num_attention_heads, len(self.attention_offsets)), initializer="zeros", @@ -284,39 +297,38 @@ def build(self, input_shape): ) self.attention_bias_idxs = tf.Variable( initial_value=tf.reshape(self.indices, (self.len_points, self.len_points)), - trainable=False, # this is a registered buffer and not a parameter + trainable=False, # this is a registered buffer and not a parameter dtype=tf.int32, name="attention_bias_idxs", ) super().build(input_shape) - # # TODO @ariG23498 - # @torch.no_grad() - # def train(self, mode=True): - # super().train(mode) - # if mode and self.attention_bias_cache: - # self.attention_bias_cache = {} # clear ab cache - - def get_attention_biases(self, device, training=None): + def get_attention_biases(self, device, training: Optional[bool] = None): if training: - return self.attention_biases[:, self.attention_bias_idxs] + return tf.gather(self.attention_biases, self.attention_bias_idxs, axis=1) else: device_key = str(device) if device_key not in self.attention_bias_cache: - print("INFO biases cache", self.attention_biases.shape) - print("INFO biases index", self.attention_bias_idxs.shape) - self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + self.attention_bias_cache[device_key] = tf.gather( + self.attention_biases, self.attention_bias_idxs, axis=1 + ) return self.attention_bias_cache[device_key] - def call(self, hidden_state, training=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): + + # TODO: figure out the clearing cache mechanism + if training and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] queries_keys_values = self.queries_keys_values(hidden_state) - # Reshape queries_keys_values + # Reshape `queries_keys_values`. reshaped_queries_keys_values = tf.reshape( queries_keys_values, shape=(batch_size, seq_length, self.num_attention_heads, -1) ) + # Split the reshaped tensor into query, key, and value. 
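# End-to-end sketch of the relative-position bias machinery above for a tiny
# 2x2 resolution: unique offsets index a small table of per-head biases, and the
# index grid is a non-trainable variable (torch's registered buffer) expanded
# with `tf.gather`:
import itertools
import tensorflow as tf

resolution, num_heads = 2, 3
points = list(itertools.product(range(resolution), range(resolution)))
attention_offsets, indices = {}, []
for p1 in points:
    for p2 in points:
        offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
        if offset not in attention_offsets:
            attention_offsets[offset] = len(attention_offsets)
        indices.append(attention_offsets[offset])

attention_biases = tf.zeros((num_heads, len(attention_offsets)))   # learned weight in the model
attention_bias_idxs = tf.Variable(
    tf.reshape(indices, (len(points), len(points))), trainable=False, dtype=tf.int32
)
bias = tf.gather(attention_biases, attention_bias_idxs, axis=1)    # (heads, seq, seq)
assert bias.shape == (3, 4, 4)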
query, key, value = tf.split( value=reshaped_queries_keys_values, num_or_size_splits=[self.key_dim, self.key_dim, self.attention_ratio * self.key_dim], @@ -348,9 +360,10 @@ def __init__( stride, resolution_in, resolution_out, + *args, **kwargs, ): - super().__init__(**kwargs) + super().__init__(*args, **kwargs) self.num_attention_heads = num_attention_heads self.scale = key_dim**-0.5 self.key_dim = key_dim @@ -359,11 +372,13 @@ def __init__( self.out_dim_projection = attention_ratio * key_dim * num_attention_heads self.resolution_out = resolution_out # resolution_in is the intial resolution, resoloution_out is final resolution after downsampling - self.keys_values = TFMLPLayerWithBN(input_dim, self.out_dim_keys_values, name="keys_values") - self.queries_subsample = TFLevitSubsample(stride, resolution_in, name="queries_subsample") - self.queries = TFMLPLayerWithBN(input_dim, key_dim * num_attention_heads, name="queries") + self.keys_values = TFMLPLayerWithBN( + input_dim=input_dim, output_dim=self.out_dim_keys_values, name="keys_values" + ) + self.queries_subsample = TFLevitSubsample(stride=stride, resolution=resolution_in, name="queries_subsample") + self.queries = TFMLPLayerWithBN(input_dim=input_dim, output_dim=key_dim * num_attention_heads, name="queries") self.activation = hard_swish - self.projection = TFMLPLayerWithBN(self.out_dim_projection, output_dim, name="projection") + self.projection = TFMLPLayerWithBN(input_dim=self.out_dim_projection, output_dim=output_dim, name="projection") self.attention_bias_cache = {} @@ -382,7 +397,7 @@ def __init__( self.attention_offsets = attention_offsets self.indices = indices - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.attention_biases = self.add_weight( shape=(self.num_attention_heads, len(self.attention_offsets)), initializer="zeros", @@ -398,23 +413,23 @@ def build(self, input_shape): ) super().build(input_shape) - # # TODO @ariG23498 - # @torch.no_grad() - # def train(self, mode=True): - # super().train(mode) - # if mode and self.attention_bias_cache: - # self.attention_bias_cache = {} # clear ab cache - - def get_attention_biases(self, device, training=None): + def get_attention_biases(self, device, training: Optional[bool] = None): if training: - return self.attention_biases[:, self.attention_bias_idxs] + return tf.gather(self.attention_biases, self.attention_bias_idxs, axis=1) else: device_key = str(device) if device_key not in self.attention_bias_cache: - self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + self.attention_bias_cache[device_key] = tf.gather( + self.attention_biases, self.attention_bias_idxs, axis=1 + ) return self.attention_bias_cache[device_key] - def call(self, hidden_state, training=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): + + # TODO: figure out the clearing cache mechanism + if training and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] @@ -451,13 +466,13 @@ class TFLevitMLPLayer(tf.keras.layers.Layer): MLP Layer with `2X` expansion in contrast to ViT with `4X`. 
""" - def __init__(self, input_dim, hidden_dim, **kwargs): - super().__init__(**kwargs) - self.linear_up = TFMLPLayerWithBN(input_dim, hidden_dim, name="linear_up") + def __init__(self, input_dim, hidden_dim, *args, **kwargs): + super().__init__(*args, **kwargs) + self.linear_up = TFMLPLayerWithBN(input_dim=input_dim, output_dim=hidden_dim, name="linear_up") self.activation = hard_swish - self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim, name="linear_down") + self.linear_down = TFMLPLayerWithBN(input_dim=hidden_dim, output_dim=input_dim, name="linear_down") - def call(self, hidden_state, training=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state = self.linear_up(hidden_state, training=training) hidden_state = self.activation(hidden_state) hidden_state = self.linear_down(hidden_state, training=training) @@ -469,16 +484,18 @@ class TFLevitResidualLayer(tf.keras.layers.Layer): Residual Block for TFLeViT """ - def __init__(self, module, drop_rate, **kwargs): - super().__init__(**kwargs) + def __init__(self, module, drop_rate, *args, **kwargs): + super().__init__(*args, **kwargs) self.module = module self.drop_rate = drop_rate - def call(self, hidden_state, training=None): - if training and self.drop_rate > 0: + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): + if training and self.drop_rate > 0.0: rnd = tf.random.normal(shape=(tf.shape(hidden_state)[0], 1, 1), minval=0, maxval=1) rnd = tf.math.greater(rnd, self.drop_rate) rnd = tf.math.divide(rnd, (1 - self.drop_rate)) + # Detach the gradient from `rnd`. + tf.stop_gradient(rnd) hidden_state = hidden_state + self.module(hidden_state) * rnd return hidden_state else: @@ -503,30 +520,45 @@ def __init__( mlp_ratio, down_ops, resolution_in, + *args, **kwargs, ): - super().__init__(**kwargs) + super().__init__(*args, **kwargs) self.layers = [] self.config = config self.resolution_in = resolution_in - # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - for index in range(depths): + # `resolution_in` is the intial resolution, `resolution_out` is final resolution after downsampling + index = 0 + for _ in range(depths): self.layers.append( TFLevitResidualLayer( - TFLevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), - self.config.drop_path_rate, + module=TFLevitAttention( + hidden_sizes=hidden_sizes, + key_dim=key_dim, + num_attention_heads=num_attention_heads, + attention_ratio=attention_ratio, + resolution=resolution_in, + name="module", + ), + drop_rate=self.config.drop_path_rate, name=f"layers.{index}", ) ) + index += 1 # Increment the index by 1 if mlp_ratio > 0: hidden_dim = hidden_sizes * mlp_ratio self.layers.append( TFLevitResidualLayer( - TFLevitMLPLayer(hidden_sizes, hidden_dim), - self.config.drop_path_rate, + module=TFLevitMLPLayer( + input_dim=hidden_sizes, + hidden_dim=hidden_dim, + name="module", + ), + drop_rate=self.config.drop_path_rate, name=f"layers.{index}", ) ) + index += 1 # Increment the index by 1 if down_ops[0] == "Subsample": self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 @@ -539,24 +571,28 @@ def __init__( stride=down_ops[5], resolution_in=resolution_in, resolution_out=self.resolution_out, - name=f"layers.{idx}", + name=f"layers.{index}", ) ) + index += 1 # Increment the index by 1 self.resolution_in = self.resolution_out if down_ops[4] > 0: hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4] self.layers.append( TFLevitResidualLayer( 
- TFLevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), - self.config.drop_path_rate, - name=f"layers.{idx}", - ) + module=TFLevitMLPLayer( + input_dim=self.config.hidden_sizes[idx + 1], hidden_dim=hidden_dim, name="module" + ), + drop_rate=self.config.drop_path_rate, + name=f"layers.{index}", + ), ) + index += 1 # Increment the index by 1 def get_resolution(self): return self.resolution_in - def call(self, hidden_state): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): for layer in self.layers: hidden_state = layer(hidden_state) return hidden_state @@ -567,38 +603,43 @@ class TFLevitEncoder(tf.keras.layers.Layer): LeViT Encoder consisting of multiple `TFLevitStage` stages. """ - def __init__(self, config, **kwargs): - super().__init__(**kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(*args, **kwargs) self.config = config resolution = self.config.image_size // self.config.patch_size self.stages = [] self.config.down_ops.append([""]) - # TODO ariG23498: add the index values to the layer names for stage_idx in range(len(config.depths)): stage = TFLevitStage( - config, - stage_idx, - config.hidden_sizes[stage_idx], - config.key_dim[stage_idx], - config.depths[stage_idx], - config.num_attention_heads[stage_idx], - config.attention_ratio[stage_idx], - config.mlp_ratio[stage_idx], - config.down_ops[stage_idx], - resolution, + config=config, + idx=stage_idx, + hidden_sizes=config.hidden_sizes[stage_idx], + key_dim=config.key_dim[stage_idx], + depths=config.depths[stage_idx], + num_attention_heads=config.num_attention_heads[stage_idx], + attention_ratio=config.attention_ratio[stage_idx], + mlp_ratio=config.mlp_ratio[stage_idx], + down_ops=config.down_ops[stage_idx], + resolution_in=resolution, name=f"stages.{stage_idx}", ) resolution = stage.get_resolution() self.stages.append(stage) - def call(self, hidden_state, output_hidden_states=False, return_dict=True, training=None): + def call( + self, + hidden_state: tf.Tensor, + output_hidden_states: bool = False, + return_dict: bool = True, + training: Optional[bool] = None, + ): all_hidden_states = () if output_hidden_states else None for stage in self.stages: if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) - hidden_state = stage(hidden_state) + hidden_state = stage(hidden_state, training=training) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) @@ -613,67 +654,19 @@ class TFLevitClassificationLayer(tf.keras.layers.Layer): LeViT Classification Layer """ - def __init__(self, input_dim, output_dim): - super().__init__() + def __init__(self, input_dim, output_dim, *args, **kwargs): + super().__init__(*args, **kwargs) # The epsilon and momentum used here are the defaults in torch batch norm layer. 
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") + self.linear = tf.keras.layers.Dense(units=output_dim, name="linear") - def call(self, hidden_state, training=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state = self.batch_norm(hidden_state, training=training) logits = self.linear(hidden_state, training=training) return logits -@keras_serializable -class TFLevitMainLayer(tf.keras.layers.Layer): - config_class = LevitConfig - - def __init__(self, config, **kwargs): - super().__init__(**kwargs) - self.config = config - self.patch_embeddings = TFLevitPatchEmbeddings(config, name="patch_embeddings") - self.encoder = TFLevitEncoder(config, name="encoder") - - @unpack_inputs - def call( - self, - pixel_values: tf.Tensor = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - training: Optional[bool] = None, - ): - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - # Apply patch embeddings to the pixel values - embeddings = self.patch_embeddings(pixel_values, training=training) - - # Apply encoder to the encoded pixel values - encoder_outputs = self.encoder( - embeddings, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=training, - ) - - # Obtain the `last_hidden_state` - last_hidden_state = encoder_outputs[0] # encoder_outputs.last_hidden_state - - # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) - pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return TFBaseModelOutputWithPoolingAndNoAttention( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, # only if the `output_hidden_states` is set to True - ) - - class TFLevitPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -717,6 +710,54 @@ def serving(self, inputs): return self.serving_output(output) +@keras_serializable +class TFLevitMainLayer(tf.keras.layers.Layer): + config_class = LevitConfig + + def __init__(self, config, *args, **kwargs): + super().__init__(*args, **kwargs) + self.config = config + self.patch_embeddings = TFLevitPatchEmbeddings(config=config, name="patch_embeddings") + self.encoder = TFLevitEncoder(config=config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: tf.Tensor = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBaseModelOutputWithPoolingAndNoAttention]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Apply patch embeddings to the pixel values + embeddings = self.patch_embeddings(pixel_values, training=training) + + # Apply encoder to the encoded pixel values + encoder_outputs = self.encoder( + embeddings, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + # Obtain the `last_hidden_state` + last_hidden_state = encoder_outputs[0] # encoder_outputs.last_hidden_state + + # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) + pooled_output = 
tf.math.reduce_mean(last_hidden_state, axis=1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, # only if the `output_hidden_states` is set to True + ) + + LEVIT_START_DOCSTRING = r""" This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the @@ -780,8 +821,8 @@ def serving(self, inputs): LEVIT_START_DOCSTRING, ) class TFLevitModel(TFLevitPreTrainedModel): - def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) self.levit = TFLevitMainLayer(config=config, name="levit") @@ -832,15 +873,17 @@ def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> LEVIT_START_DOCSTRING, ) class TFLevitForImageClassification(TFLevitPreTrainedModel): - def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) self.config = config self.num_labels = config.num_labels - self.levit = TFLevitMainLayer(config, name="levit") + self.levit = TFLevitMainLayer(config=config, name="levit") # Classifier head self.classifier = ( - TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier") + TFLevitClassificationLayer( + input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier" + ) if config.num_labels > 0 else tf.identity ) @@ -888,8 +931,7 @@ def call( if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" - # TODO @ariG23498: Check with the dtypes (long and int in torch) - elif self.num_labels > 1 and (labels.dtype == tf.float64 or labels.dtype == tf.int64): + elif self.num_labels > 1 and (labels.dtype == tf.int64 or labels.dtype == tf.int32): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" @@ -927,20 +969,24 @@ def call( LEVIT_START_DOCSTRING, ) class TFLevitForImageClassificationWithTeacher(TFLevitPreTrainedModel): - def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) self.config = config self.num_labels = config.num_labels self.levit = TFLevitMainLayer(config, name="levit") # Classifier head self.classifier = ( - TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier") + TFLevitClassificationLayer( + input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier" + ) if config.num_labels > 0 else tf.identity ) self.classifier_distill = ( - TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier_distill") + TFLevitClassificationLayer( + input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier_distill" + ) if config.num_labels > 0 else tf.identity ) From fdb690756232dc6c683fd21ac122b80509d674f9 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Fri, 25 Nov 2022 15:19:36 +0530 Subject: [PATCH 10/11] chore: making mdx changes and adding the tf model to various inits --- docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/levit.mdx | 3 +- src/transformers/__init__.py | 16 ++++++++++ .../models/auto/modeling_tf_auto.py | 1 + 
src/transformers/models/levit/__init__.py | 30 ++++++++++++++++++- .../models/levit/modeling_tf_levit.py | 2 +- 6 files changed, 50 insertions(+), 4 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 790ce8f4d176..7b9820bb2ad9 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -277,7 +277,7 @@ Flax), PyTorch, and/or TensorFlow. | LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | | LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | | LED | ✅ | ✅ | ✅ | ✅ | ❌ | -| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LeViT | ❌ | ❌ | ✅ | ✅ | ❌ | | LiLT | ❌ | ❌ | ✅ | ❌ | ❌ | | Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | | LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | diff --git a/docs/source/en/model_doc/levit.mdx b/docs/source/en/model_doc/levit.mdx index 1ebe93ff3ff7..017a97af7328 100644 --- a/docs/source/en/model_doc/levit.mdx +++ b/docs/source/en/model_doc/levit.mdx @@ -59,7 +59,8 @@ Tips: - You can check out demo notebooks regarding inference as well as fine-tuning on custom data [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) (you can just replace [`ViTFeatureExtractor`] by [`LevitFeatureExtractor`] and [`ViTForImageClassification`] by [`LevitForImageClassification`] or [`LevitForImageClassificationWithTeacher`]). -This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/facebookresearch/LeViT). +This model was contributed by [anugunj](https://huggingface.co/anugunj). The TensorFlow version was contributed by +[Aritra Roy Gosthipaty](https://huggingface.co/ariG23498). The original code can be found [here](https://github.com/facebookresearch/LeViT). ## LevitConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9c5f33bea535..503673648e7f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2696,6 +2696,15 @@ ] ) _import_structure["models.led"].extend(["TFLEDForConditionalGeneration", "TFLEDModel", "TFLEDPreTrainedModel"]) + _import_structure["models.levit"].extend( + [ + "TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFLevitForImageClassification", + "TFLevitForImageClassificationWithTeacher", + "TFLevitModel", + "TFLevitPreTrainedModel", + ] + ) _import_structure["models.longformer"].extend( [ "TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5472,6 +5481,13 @@ TFLayoutLMv3PreTrainedModel, ) from .models.led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel + from .models.levit import ( + TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFLevitForImageClassification, + TFLevitForImageClassificationWithTeacher, + TFLevitModel, + TFLevitPreTrainedModel, + ) from .models.longformer import ( TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, TFLongformerForMaskedLM, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 8bb7b5595f35..ad18273430a9 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -57,6 +57,7 @@ ("layoutlm", "TFLayoutLMModel"), ("layoutlmv3", "TFLayoutLMv3Model"), ("led", "TFLEDModel"), + ("levit", "TFLevitModel"), ("longformer", "TFLongformerModel"), ("lxmert", "TFLxmertModel"), ("marian", "TFMarianModel"), diff --git a/src/transformers/models/levit/__init__.py b/src/transformers/models/levit/__init__.py index f42fb02ad071..9cce4e7f3cf8 100644 --- a/src/transformers/models/levit/__init__.py +++ b/src/transformers/models/levit/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_tf_available, is_vision_available _import_structure = {"configuration_levit": ["LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LevitConfig", "LevitOnnxConfig"]} @@ -45,6 +45,20 @@ "LevitPreTrainedModel", ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_levit"] = [ + "TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFLevitForImageClassification", + "TFLevitForImageClassificationWithTeacher", + "TFLevitModel", + "TFLevitPreTrainedModel", + ] + if TYPE_CHECKING: from .configuration_levit import LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, LevitConfig, LevitOnnxConfig @@ -71,6 +85,20 @@ LevitModel, LevitPreTrainedModel, ) + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_levit import ( + TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFLevitForImageClassification, + TFLevitForImageClassificationWithTeacher, + TFLevitModel, + TFLevitPreTrainedModel, + ) else: import sys diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 4c8af385b2e5..8fca39106989 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -49,7 +49,7 @@ _IMAGE_CLASS_CHECKPOINT = "facebook/levit-128S" _IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" -LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ +TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/levit-128S", # See all LeViT models at https://huggingface.co/models?filter=levit ] From 95ffed10ebb6d79996e0475ec83b4bb40e8615f8 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Mon, 28 Nov 2022 17:17:48 +0530 Subject: [PATCH 11/11] chore: changing the defaults of BN layers and applying style fixup --- src/transformers/models/levit/__init__.py | 10 ++- .../models/levit/modeling_tf_levit.py | 65 +++++++++++-------- src/transformers/utils/dummy_tf_objects.py | 31 +++++++++ 3 files changed, 76 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/levit/__init__.py b/src/transformers/models/levit/__init__.py index 9cce4e7f3cf8..7a52103e6d4d 100644 --- a/src/transformers/models/levit/__init__.py +++ b/src/transformers/models/levit/__init__.py @@ -17,7 +17,13 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_tf_available, is_vision_available +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_tf_available, + is_torch_available, + is_vision_available, +) _import_structure = {"configuration_levit": ["LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LevitConfig", "LevitOnnxConfig"]} @@ -85,7 +91,7 @@ LevitModel, LevitPreTrainedModel, ) - + try: if not is_tf_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 8fca39106989..a66f2cd59436 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -113,13 +113,15 @@ def __init__( name="convolution", ) # The epsilon and momentum used here are the defaults in torch batch norm layer. 
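+ # NOTE: keras and torch define batch norm momentum in opposite senses: keras updates moving_mean = momentum * moving_mean + (1 - momentum) * batch_mean, while torch weights the new batch statistic by momentum, so torch's default of 0.1 corresponds to keras momentum=0.9 below.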
- self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") + self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.9, name="batch_norm") def call(self, embeddings: tf.Tensor, training: Optional[bool] = None): + # embeddings shape = (bsz, num_channels, height, width) embeddings = tf.transpose(embeddings, perm=(0, 2, 3, 1)) embeddings = self.padding(embeddings) embeddings = self.convolution(embeddings, training=training) embeddings = self.batch_norm(embeddings, training=training) + # embeddings shape = (bsz, height, width, num_channels) embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) return embeddings @@ -205,9 +207,17 @@ def call(self, pixel_values: tf.Tensor, training: Optional[bool] = None): class TFMLPLayerWithBN(tf.keras.layers.Layer): def __init__(self, input_dim, output_dim, bn_weight_init=1, *args, **kwargs): super().__init__(*args, **kwargs) - self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") + self.linear = tf.keras.layers.Dense( + units=output_dim, + use_bias=False, + name="linear", + ) # The epsilon matches the torch default; keras momentum=0.9 is equivalent to torch momentum=0.1. - self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") + self.batch_norm = tf.keras.layers.BatchNormalization( + epsilon=1e-05, + momentum=0.9, + name="batch_norm", + ) def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state = self.linear(hidden_state, training=training) @@ -218,6 +228,7 @@ def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state_reshape_list = [ hidden_state_shape_list[0] * hidden_state_shape_list[1] ] + hidden_state_shape_list[2:] + flattened_hidden_state = tf.reshape(hidden_state, shape=hidden_state_reshape_list) batch_norm_hidden_state = self.batch_norm(flattened_hidden_state, training=training) @@ -228,7 +239,7 @@ def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): class TFLevitSubsample(tf.keras.layers.Layer): """ - Layer to subsample the activatioin maps + Layer to subsample the activation maps. """ def __init__(self, stride, resolution, *args, **kwargs): @@ -272,20 +283,18 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, self.len_points = len(points) # Initialize the attention offsets and indices - attention_offsets, indices = {}, [] + self.attention_offsets, self.indices = {}, [] # Iterate over every pair of points in the `points` list and record each distinct # absolute offset, assigning every offset a column index into the learned biases
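# For example, assuming resolution=2: points = [(0, 0), (0, 1), (1, 0), (1, 1)], the distinct offsets are (0, 0), (0, 1), (1, 0) and (1, 1), so four bias columns are learned and `indices` holds 4 * 4 = 16 lookups into them.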
for p1 in points: # iterate over every point in the feature map for p2 in points: # pair `p1` with every point, including itself offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) - if offset not in attention_offsets: - attention_offsets[offset] = len(attention_offsets) - indices.append(attention_offsets[offset]) + if offset not in self.attention_offsets: + self.attention_offsets[offset] = len(self.attention_offsets) + self.indices.append(self.attention_offsets[offset]) - # Store the attention offsets, indices and attention bias cache - self.attention_offsets = attention_offsets - self.indices = indices + # Store attention bias cache self.attention_bias_cache = {} def build(self, input_shape: tf.TensorShape): @@ -317,8 +326,8 @@ def get_attention_biases(self, device, training: Optional[bool] = None): def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): # TODO: figure out the clearing cache mechanism - if training and self.attention_bias_cache: - self.attention_bias_cache = {} # clear ab cache + # if training and self.attention_bias_cache: + # self.attention_bias_cache = {} # clear ab cache batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] @@ -427,8 +436,8 @@ def get_attention_biases(self, device, training: Optional[bool] = None): def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): # TODO: figure out the clearing cache mechanism - if training and self.attention_bias_cache: - self.attention_bias_cache = {} # clear ab cache + # if training and self.attention_bias_cache: + # self.attention_bias_cache = {} # clear ab cache batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] @@ -594,7 +603,7 @@ def get_resolution(self): def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): for layer in self.layers: - hidden_state = layer(hidden_state) + hidden_state = layer(hidden_state, training=training) return hidden_state @@ -658,7 +667,7 @@ def __init__(self, input_dim, output_dim, *args, **kwargs): super().__init__(*args, **kwargs) # The epsilon matches the torch default; keras momentum=0.9 is equivalent to torch momentum=0.1. - self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") + self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.9, name="batch_norm") self.linear = tf.keras.layers.Dense(units=output_dim, name="linear") def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): @@ -823,7 +832,6 @@ def call( class TFLevitModel(TFLevitPreTrainedModel): def __init__(self, config, *args, **kwargs): super().__init__(config, *args, **kwargs) - self.levit = TFLevitMainLayer(config=config, name="levit") @unpack_inputs @@ -849,10 +857,8 @@ def call( return_dict=return_dict, training=training, ) - return outputs - # TODO @ariG23498: Check the output type for serving.
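+ # NOTE: LeViT emits no attention maps (its outputs are the `...NoAttention` types), so the `attentions` branch below is expected to resolve to None in practice.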
def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -885,7 +891,7 @@ def __init__(self, config, *args, **kwargs): input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier" ) if config.num_labels > 0 - else tf.identity + else tf.keras.layers.Activation("linear", name="classifier") ) @unpack_inputs @@ -978,17 +984,21 @@ def __init__(self, config, *args, **kwargs): # Classifier head self.classifier = ( TFLevitClassificationLayer( - input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier" + input_dim=config.hidden_sizes[-1], + output_dim=config.num_labels, + name="classifier", ) if config.num_labels > 0 - else tf.identity + else tf.keras.layers.Activation("linear", name="classifier") ) self.classifier_distill = ( TFLevitClassificationLayer( - input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier_distill" + input_dim=config.hidden_sizes[-1], + output_dim=config.num_labels, + name="classifier_distill", ) if config.num_labels > 0 - else tf.identity + else tf.keras.layers.Activation("linear", name="classifier_distill") ) @unpack_inputs @@ -1020,9 +1030,8 @@ def call( sequence_output = tf.math.reduce_mean(sequence_output, axis=1) # Apply the classifier heads and obtain the `cls_logits` and `distill_logits` - cls_logits, distill_logits = self.classifier(sequence_output, training=training), self.classifier_distill( - sequence_output, training=training - ) + cls_logits = self.classifier(sequence_output, training=training) + distill_logits = self.classifier_distill(sequence_output, training=training) # According to the paper, the cls and distill logits are averaged logits = (cls_logits + distill_logits) / 2 diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index d16a75591d62..a72fa1db2461 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -1499,6 +1499,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFLevitForImageClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLevitForImageClassificationWithTeacher(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLevitModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLevitPreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
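As a quick end-to-end sanity check of the ported model (a minimal sketch, not part of the patch: it assumes a source install of this branch, reuses the existing `LevitFeatureExtractor` from the PyTorch side, and converts the torch checkpoint with `from_pt=True`):

    from PIL import Image
    import requests

    from transformers import LevitFeatureExtractor, TFLevitModel

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    feature_extractor = LevitFeatureExtractor.from_pretrained("facebook/levit-128S")
    model = TFLevitModel.from_pretrained("facebook/levit-128S", from_pt=True)

    # The feature extractor returns channels-first pixel values, matching the
    # data_format used by the convolutional embeddings.
    inputs = feature_extractor(images=image, return_tensors="tf")
    outputs = model(**inputs)

    # last_hidden_state has shape (batch_size, seq_length, hidden_size)
    print(outputs.last_hidden_state.shape)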