From 301f96c351da23d0778ebc7a141e01603ff9b9fd Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Thu, 22 Sep 2022 10:03:08 +0530 Subject: [PATCH 01/11] chore: initial commit --- .../models/levit/modeling_tf_levit.py | 822 ++++++++++++++++++ 1 file changed, 822 insertions(+) create mode 100644 src/transformers/models/levit/modeling_tf_levit.py diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py new file mode 100644 index 000000000000..02cffe532d52 --- /dev/null +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -0,0 +1,822 @@ +# coding=utf-8 +# Copyright 2022 Meta Platforms, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TensorFlow LeViT model.""" + +import itertools +from dataclasses import dataclass +from typing import Optional, Tuple + +import tensorflow as tf +from tensorflow.keras import backend as K + +from ...modeling_outputs import ModelOutput +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithNoAttention, + TFBaseModelOutputWithPoolingAndNoAttention, + TFImageClassifierOutputWithNoAttention, +) +from ...modeling_tf_utils import TFPreTrainedModel +from ...tf_utils import shape_list, stable_softmax +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_levit import LevitConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "LevitConfig" +_FEAT_EXTRACTOR_FOR_DOC = "LevitFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "facebook/levit-128S" +_EXPECTED_OUTPUT_SHAPE = [1, 16, 384] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "facebook/levit-128S" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + +LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/levit-128S", + # See all LeViT models at https://huggingface.co/models?filter=levit +] + + +@dataclass +class TFLevitForImageClassificationWithTeacherOutput(ModelOutput): + """ + Output type of [`LevitForImageClassificationWithTeacher`]. + + Args: + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores as the average of the `cls_logits` and `distillation_logits`. + cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the + class token). + distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the + distillation token). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. 
Hidden-states of the model at the output of each layer
+            plus the initial embedding outputs.
+    """
+
+    logits: tf.Tensor = None
+    cls_logits: tf.Tensor = None
+    distillation_logits: tf.Tensor = None
+    hidden_states: Optional[Tuple[tf.Tensor]] = None
+
+
+class TFLevitConvEmbeddings(tf.keras.layers.Layer):
+    """
+    LeViT Conv Embeddings with Batch Norm, used in the initial patch embedding layer.
+    """
+
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        kernel_size,
+        stride,
+        padding,
+        dilation=1,
+        groups=1,
+        bn_weight_init=1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        # torch-style integer padding has no equivalent in `tf.keras.layers.Conv2D` (which
+        # only accepts "valid"/"same"), so pad explicitly and convolve without padding
+        self.zero_padding = tf.keras.layers.ZeroPadding2D(padding=padding, data_format="channels_first")
+        self.convolution = tf.keras.layers.Conv2D(
+            filters=out_channels,
+            kernel_size=kernel_size,
+            strides=stride,
+            padding="valid",
+            dilation_rate=dilation,
+            groups=groups,
+            use_bias=False,
+            data_format="channels_first",  # required for tf
+            name="convolution",
+        )
+        # The epsilon and momentum used here are the defaults in torch batch norm layer.
+        self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm")
+
+    def call(self, embeddings):
+        embeddings = self.convolution(self.zero_padding(embeddings))
+        embeddings = self.batch_norm(embeddings)
+        return embeddings
+
+
+# Defining hard swish with keras backend.
+def hard_swish(x):
+    return x * (K.relu(x + 3.0, max_value=6.0) / 6.0)
+
+
+class TFLevitPatchEmbeddings(tf.keras.layers.Layer):
+    """
+    LeViT patch embeddings, for final embeddings to be passed to transformer blocks. It consists of multiple
+    `TFLevitConvEmbeddings`.
+    """
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.embedding_layer_1 = TFLevitConvEmbeddings(
+            config.num_channels,
+            config.hidden_sizes[0] // 8,
+            config.kernel_size,
+            config.stride,
+            config.padding,
+            name="embedding_layer_1",
+        )
+        self.activation_layer_1 = hard_swish
+
+        self.embedding_layer_2 = TFLevitConvEmbeddings(
+            config.hidden_sizes[0] // 8,
+            config.hidden_sizes[0] // 4,
+            config.kernel_size,
+            config.stride,
+            config.padding,
+            name="embedding_layer_2",
+        )
+        self.activation_layer_2 = hard_swish
+
+        self.embedding_layer_3 = TFLevitConvEmbeddings(
+            config.hidden_sizes[0] // 4,
+            config.hidden_sizes[0] // 2,
+            config.kernel_size,
+            config.stride,
+            config.padding,
+            name="embedding_layer_3",
+        )
+        self.activation_layer_3 = hard_swish
+
+        self.embedding_layer_4 = TFLevitConvEmbeddings(
+            config.hidden_sizes[0] // 2,
+            config.hidden_sizes[0],
+            config.kernel_size,
+            config.stride,
+            config.padding,
+            name="embedding_layer_4",
+        )
+        self.num_channels = config.num_channels
+
+    def call(self, pixel_values):
+        batch_size = tf.shape(pixel_values)[0]
+        num_channels = tf.shape(pixel_values)[1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values matches the one set in the configuration."
+            )
+        embeddings = self.embedding_layer_1(pixel_values)
+        embeddings = self.activation_layer_1(embeddings)
+        embeddings = self.embedding_layer_2(embeddings)
+        embeddings = self.activation_layer_2(embeddings)
+        embeddings = self.embedding_layer_3(embeddings)
+        embeddings = self.activation_layer_3(embeddings)
+        embeddings = self.embedding_layer_4(embeddings)
+        # Flatten the spatial dimensions of the embeddings; the channel dimension here is
+        # the output width of `embedding_layer_4`, not the `num_channels` of the input
+        out_channels = tf.shape(embeddings)[1]
+        flattened_embeddings = tf.reshape(embeddings, shape=(batch_size, out_channels, -1))
+        # Transpose the channel and spatial axis of the flattened embeddings
+        transposed_embeddings = tf.transpose(flattened_embeddings, perm=(0, 2, 1))
+        return transposed_embeddings
+
+
+class TFMLPLayerWithBN(tf.keras.layers.Layer):
+    def __init__(self, input_dim, output_dim, bn_weight_init=1, **kwargs):
+        super().__init__(**kwargs)
+        self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear")
+        # The epsilon and momentum used here are the defaults in torch batch norm layer.
+        self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm")
+
+    def call(self, hidden_state):
+        hidden_state = self.linear(hidden_state)
+        # The channel dimension has to be read after the linear projection, which changes
+        # it from `input_dim` to `output_dim`
+        num_channels = tf.shape(hidden_state)[2]
+        # Before sending the hidden state to the batch normalization layer, we would have to
+        # flatten the hidden states in the batch and seq len dimension
+        flattened_hidden_state = tf.reshape(hidden_state, shape=(-1, num_channels))
+        batch_norm_hidden_state = self.batch_norm(flattened_hidden_state)
+        # Reshape the output of batch norm to have the same shape as the original hidden state
+        hidden_state = tf.reshape(batch_norm_hidden_state, shape=tf.shape(hidden_state))
+        return hidden_state
+
+
+class TFLevitSubsample(tf.keras.layers.Layer):
+    def __init__(self, stride, resolution, **kwargs):
+        super().__init__(**kwargs)
+        self.stride = stride
+        self.resolution = resolution
+
+    def call(self, hidden_state):
+        batch_size = tf.shape(hidden_state)[0]
+        channels = tf.shape(hidden_state)[2]
+        reshaped_hidden_state = tf.reshape(
+            hidden_state, shape=(batch_size, self.resolution, self.resolution, channels)
+        )
+        strided_hidden_state = reshaped_hidden_state[:, :: self.stride, :: self.stride]
+        hidden_state = tf.reshape(strided_hidden_state, shape=(batch_size, -1, channels))
+        return hidden_state
+
+
+class TFLevitAttention(tf.keras.layers.Layer):
+    def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution, **kwargs):
+        super().__init__(**kwargs)
+        self.num_attention_heads = num_attention_heads
+        self.scale = key_dim**-0.5
+        self.key_dim = key_dim
+        self.attention_ratio = attention_ratio
+        self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads * 2
+        self.out_dim_projection = attention_ratio * key_dim * num_attention_heads
+
+        self.queries_keys_values = TFMLPLayerWithBN(hidden_sizes, self.out_dim_keys_values, name="queries_keys_values")
+        self.activation = hard_swish
+        self.projection = TFMLPLayerWithBN(self.out_dim_projection, hidden_sizes, bn_weight_init=0, name="projection")
+
+        points = list(itertools.product(range(resolution), range(resolution)))
+        len_points = len(points)
+        attention_offsets, indices = {}, []
+        for p1 in points:
+            for p2 in points:
+                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                indices.append(attention_offsets[offset])
+        self.attention_offsets = attention_offsets
+        self.indices = indices
+        self.attention_bias_cache = {}
+
+    def 
build(self, input_shape): + self.attention_biases = self.add_weight( + shape=(self.num_attention_heads, len(self.attention_offsets)), + initializer="zeros", + trainable=True, + name="attention_biases", + ) + super().build(input_shape) + + # Todo: @ariG23498 + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + + def get_attention_biases(self, device, attention_bias_idxs, training=None): + if training: + return self.attention_biases[:, attention_bias_idxs] + else: + device_key = str(device) + if device_key not in self.attention_bias_cache: + self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + return self.attention_bias_cache[device_key] + + def call(self, hidden_state, attention_bias_idxs, training=None): + batch_size = tf.shape(hidden_state)[0] + seq_length = tf.shape(hidden_state)[1] + queries_keys_values = self.queries_keys_values(hidden_state) + + # Reshape queries_keys_values + reshaped_queries_keys_values = tf.reshape( + queries_keys_values, shape=(batch_size, seq_length, self.num_attention_heads, -1) + ) + query, key, value = tf.split( + value=reshaped_queries_keys_values, + num_or_size_splits=[self.key_dim, self.key_dim, self.attention_ratio * self.key_dim], + axis=3, + ) + query = tf.transpose(query, perm=(0, 2, 1, 3)) + key = tf.transpose(key, perm=(0, 2, 1, 3)) + value = tf.transpose(value, perm=(0, 2, 1, 3)) + + attention = tf.matmul(query, key, transpose_b=True) * self.scale + self.get_attention_biases( + hidden_state.device, attention_bias_idxs, training=training + ) + attention = stable_softmax(attention, axis=-1) + hidden_state = tf.matmul(attention, value) + hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3)) + hidden_state = tf.reshape(hidden_state, shape=(batch_size, seq_length, self.out_dim_projection)) + hidden_state = self.projection(self.activation(hidden_state)) + return hidden_state + + +class LevitAttentionSubsample(nn.Module): + def __init__( + self, + input_dim, + output_dim, + key_dim, + num_attention_heads, + attention_ratio, + stride, + resolution_in, + resolution_out, + ): + super().__init__() + self.num_attention_heads = num_attention_heads + self.scale = key_dim**-0.5 + self.key_dim = key_dim + self.attention_ratio = attention_ratio + self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads + self.out_dim_projection = attention_ratio * key_dim * num_attention_heads + self.resolution_out = resolution_out + # resolution_in is the intial resolution, resoloution_out is final resolution after downsampling + self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values) + self.queries_subsample = LevitSubsample(stride, resolution_in) + self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads) + self.activation = nn.Hardswish() + self.projection = MLPLayerWithBN(self.out_dim_projection, output_dim) + + self.attention_bias_cache = {} + + points = list(itertools.product(range(resolution_in), range(resolution_in))) + points_ = list(itertools.product(range(resolution_out), range(resolution_out))) + len_points, len_points_ = len(points), len(points_) + attention_offsets, indices = {}, [] + for p1 in points_: + for p2 in points: + size = 1 + offset = (abs(p1[0] * stride - p2[0] + (size - 1) / 2), abs(p1[1] * stride - p2[1] + (size - 1) / 2)) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + 
indices.append(attention_offsets[offset]) + + self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets))) + self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points)) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + + def get_attention_biases(self, device): + if self.training: + return self.attention_biases[:, self.attention_bias_idxs] + else: + device_key = str(device) + if device_key not in self.attention_bias_cache: + self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + return self.attention_bias_cache[device_key] + + def forward(self, hidden_state): + batch_size, seq_length, _ = hidden_state.shape + key, value = ( + self.keys_values(hidden_state) + .view(batch_size, seq_length, self.num_attention_heads, -1) + .split([self.key_dim, self.attention_ratio * self.key_dim], dim=3) + ) + key = key.permute(0, 2, 1, 3) + value = value.permute(0, 2, 1, 3) + + query = self.queries(self.queries_subsample(hidden_state)) + query = query.view(batch_size, self.resolution_out**2, self.num_attention_heads, self.key_dim).permute( + 0, 2, 1, 3 + ) + + attention = query @ key.transpose(-2, -1) * self.scale + self.get_attention_biases(hidden_state.device) + attention = attention.softmax(dim=-1) + hidden_state = (attention @ value).transpose(1, 2).reshape(batch_size, -1, self.out_dim_projection) + hidden_state = self.projection(self.activation(hidden_state)) + return hidden_state + + +class LevitMLPLayer(nn.Module): + """ + MLP Layer with `2X` expansion in contrast to ViT with `4X`. + """ + + def __init__(self, input_dim, hidden_dim): + super().__init__() + self.linear_up = MLPLayerWithBN(input_dim, hidden_dim) + self.activation = nn.Hardswish() + self.linear_down = MLPLayerWithBN(hidden_dim, input_dim) + + def forward(self, hidden_state): + hidden_state = self.linear_up(hidden_state) + hidden_state = self.activation(hidden_state) + hidden_state = self.linear_down(hidden_state) + return hidden_state + + +class LevitResidualLayer(nn.Module): + """ + Residual Block for LeViT + """ + + def __init__(self, module, drop_rate): + super().__init__() + self.module = module + self.drop_rate = drop_rate + + def forward(self, hidden_state): + if self.training and self.drop_rate > 0: + rnd = torch.rand(hidden_state.size(0), 1, 1, device=hidden_state.device) + rnd = rnd.ge_(self.drop_rate).div(1 - self.drop_rate).detach() + hidden_state = hidden_state + self.module(hidden_state) * rnd + return hidden_state + else: + hidden_state = hidden_state + self.module(hidden_state) + return hidden_state + + +class LevitStage(nn.Module): + """ + LeViT Stage consisting of `LevitMLPLayer` and `LevitAttention` layers. 
+ """ + + def __init__( + self, + config, + idx, + hidden_sizes, + key_dim, + depths, + num_attention_heads, + attention_ratio, + mlp_ratio, + down_ops, + resolution_in, + ): + super().__init__() + self.layers = [] + self.config = config + self.resolution_in = resolution_in + # resolution_in is the intial resolution, resolution_out is final resolution after downsampling + for _ in range(depths): + self.layers.append( + LevitResidualLayer( + LevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), + self.config.drop_path_rate, + ) + ) + if mlp_ratio > 0: + hidden_dim = hidden_sizes * mlp_ratio + self.layers.append( + LevitResidualLayer(LevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate) + ) + + if down_ops[0] == "Subsample": + self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 + self.layers.append( + LevitAttentionSubsample( + *self.config.hidden_sizes[idx : idx + 2], + key_dim=down_ops[1], + num_attention_heads=down_ops[2], + attention_ratio=down_ops[3], + stride=down_ops[5], + resolution_in=resolution_in, + resolution_out=self.resolution_out, + ) + ) + self.resolution_in = self.resolution_out + if down_ops[4] > 0: + hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4] + self.layers.append( + LevitResidualLayer( + LevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate + ) + ) + + self.layers = nn.ModuleList(self.layers) + + def get_resolution(self): + return self.resolution_in + + def forward(self, hidden_state): + for layer in self.layers: + hidden_state = layer(hidden_state) + return hidden_state + + +class LevitEncoder(nn.Module): + """ + LeViT Encoder consisting of multiple `LevitStage` stages. + """ + + def __init__(self, config): + super().__init__() + self.config = config + resolution = self.config.image_size // self.config.patch_size + self.stages = [] + self.config.down_ops.append([""]) + + for stage_idx in range(len(config.depths)): + stage = LevitStage( + config, + stage_idx, + config.hidden_sizes[stage_idx], + config.key_dim[stage_idx], + config.depths[stage_idx], + config.num_attention_heads[stage_idx], + config.attention_ratio[stage_idx], + config.mlp_ratio[stage_idx], + config.down_ops[stage_idx], + resolution, + ) + resolution = stage.get_resolution() + self.stages.append(stage) + + self.stages = nn.ModuleList(self.stages) + + def forward(self, hidden_state, output_hidden_states=False, return_dict=True): + all_hidden_states = () if output_hidden_states else None + + for stage in self.stages: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + hidden_state = stage(hidden_state) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + if not return_dict: + return tuple(v for v in [hidden_state, all_hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states) + + +class LevitClassificationLayer(nn.Module): + """ + LeViT Classification Layer + """ + + def __init__(self, input_dim, output_dim): + super().__init__() + self.batch_norm = nn.BatchNorm1d(input_dim) + self.linear = nn.Linear(input_dim, output_dim) + + def forward(self, hidden_state): + hidden_state = self.batch_norm(hidden_state) + logits = self.linear(hidden_state) + return logits + + +class LevitPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = LevitConfig + base_model_prefix = "levit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, LevitModel): + module.gradient_checkpointing = value + + +LEVIT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`LevitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +LEVIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+""" + + +@add_start_docstrings( + "The bare Levit model outputting raw features without any specific head on top.", + LEVIT_START_DOCSTRING, +) +class LevitModel(LevitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.patch_embeddings = LevitPatchEmbeddings(config) + self.encoder = LevitEncoder(config) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndNoAttention, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: torch.FloatTensor = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embeddings = self.patch_embeddings(pixel_values) + encoder_outputs = self.encoder( + embeddings, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + + # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) + pooled_output = last_hidden_state.mean(dim=1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + LEVIT_START_DOCSTRING, +) +class LevitForImageClassification(LevitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.num_labels = config.num_labels + self.levit = LevitModel(config) + + # Classifier head + self.classifier = ( + LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels) + if config.num_labels > 0 + else torch.nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: torch.FloatTensor = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + sequence_output = outputs[0] + sequence_output = sequence_output.mean(1) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutputWithNoAttention( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + LeViT Model transformer with image classification heads on top (a linear layer on top of the final hidden state and + a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet. .. warning:: + This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet + supported. 
+ """, + LEVIT_START_DOCSTRING, +) +class LevitForImageClassificationWithTeacher(LevitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + self.num_labels = config.num_labels + self.levit = LevitModel(config) + + # Classifier head + self.classifier = ( + LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels) + if config.num_labels > 0 + else torch.nn.Identity() + ) + self.classifier_distill = ( + LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels) + if config.num_labels > 0 + else torch.nn.Identity() + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=LevitForImageClassificationWithTeacherOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: torch.FloatTensor = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + sequence_output = outputs[0] + sequence_output = sequence_output.mean(1) + cls_logits, distill_logits = self.classifier(sequence_output), self.classifier_distill(sequence_output) + logits = (cls_logits + distill_logits) / 2 + + if not return_dict: + output = (logits, cls_logits, distill_logits) + outputs[2:] + return output + + return LevitForImageClassificationWithTeacherOutput( + logits=logits, + cls_logits=cls_logits, + distillation_logits=distill_logits, + hidden_states=outputs.hidden_states, + ) From 961a638029cc48626a2a3e4e9640f7e47d75acb2 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Thu, 6 Oct 2022 22:23:37 +0530 Subject: [PATCH 02/11] chore: porting layers into TF --- .../models/levit/modeling_tf_levit.py | 446 +++++++++++------- 1 file changed, 276 insertions(+), 170 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 02cffe532d52..6b1de7a42815 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -16,9 +16,10 @@ import itertools from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, Dict import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy, CategoricalCrossentropy from tensorflow.keras import backend as K from ...modeling_outputs import ModelOutput @@ -27,7 +28,7 @@ TFBaseModelOutputWithPoolingAndNoAttention, TFImageClassifierOutputWithNoAttention, ) -from ...modeling_tf_utils import TFPreTrainedModel +from ...modeling_tf_utils import TFPreTrainedModel, keras_serializable, unpack_inputs from ...tf_utils import shape_list, stable_softmax from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging from .configuration_levit import LevitConfig @@ -68,7 +69,7 @@ class token). Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the distillation token). 
hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. """ @@ -263,7 +264,7 @@ def build(self, input_shape): ) super().build(input_shape) - # Todo: @ariG23498 + # TODO @ariG23498 @torch.no_grad() def train(self, mode=True): super().train(mode) @@ -308,7 +309,7 @@ def call(self, hidden_state, attention_bias_idxs, training=None): return hidden_state -class LevitAttentionSubsample(nn.Module): +class TFLevitAttentionSubsample(tf.keras.layers.Layer): def __init__( self, input_dim, @@ -319,8 +320,9 @@ def __init__( stride, resolution_in, resolution_out, + **kwargs, ): - super().__init__() + super().__init__(**kwargs) self.num_attention_heads = num_attention_heads self.scale = key_dim**-0.5 self.key_dim = key_dim @@ -329,11 +331,11 @@ def __init__( self.out_dim_projection = attention_ratio * key_dim * num_attention_heads self.resolution_out = resolution_out # resolution_in is the intial resolution, resoloution_out is final resolution after downsampling - self.keys_values = MLPLayerWithBN(input_dim, self.out_dim_keys_values) - self.queries_subsample = LevitSubsample(stride, resolution_in) - self.queries = MLPLayerWithBN(input_dim, key_dim * num_attention_heads) - self.activation = nn.Hardswish() - self.projection = MLPLayerWithBN(self.out_dim_projection, output_dim) + self.keys_values = TFMLPLayerWithBN(input_dim, self.out_dim_keys_values, name="keys_values") + self.queries_subsample = TFLevitSubsample(stride, resolution_in, name="queries_subsample") + self.queries = TFMLPLayerWithBN(input_dim, key_dim * num_attention_heads, name="queries") + self.activation = hard_swish + self.projection = TFMLPLayerWithBN(self.out_dim_projection, output_dim, name="projection") self.attention_bias_cache = {} @@ -349,78 +351,97 @@ def __init__( attention_offsets[offset] = len(attention_offsets) indices.append(attention_offsets[offset]) - self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets))) - self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points)) + self.attention_offsets = attention_offsets + + def build(self, input_shape): + self.attention_biases = self.add_weight( + shape=(self.num_attention_heads, len(self.attention_offsets)), + initializer="zeros", + trainable=True, + name="attention_biases", + ) + super().build(input_shape) + # TODO @ariG23498 @torch.no_grad() def train(self, mode=True): super().train(mode) if mode and self.attention_bias_cache: self.attention_bias_cache = {} # clear ab cache - def get_attention_biases(self, device): - if self.training: - return self.attention_biases[:, self.attention_bias_idxs] + def get_attention_biases(self, device, attention_bias_idxs, training=None): + if training: + return self.attention_biases[:, attention_bias_idxs] else: device_key = str(device) if device_key not in self.attention_bias_cache: self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] return self.attention_bias_cache[device_key] - def forward(self, hidden_state): - batch_size, seq_length, _ = hidden_state.shape - key, value = ( - 
self.keys_values(hidden_state)
-            .view(batch_size, seq_length, self.num_attention_heads, -1)
-            .split([self.key_dim, self.attention_ratio * self.key_dim], dim=3)
+    def call(self, hidden_state, attention_bias_idxs, training=None):
+        batch_size = tf.shape(hidden_state)[0]
+        seq_length = tf.shape(hidden_state)[1]
+
+        # Project the hidden states and reshape them for the attention heads
+        reshaped_hidden_state = tf.reshape(
+            self.keys_values(hidden_state),
+            shape=(batch_size, seq_length, self.num_attention_heads, -1),
+        )
+        # Split the reshaped hidden state into key and value
+        key, value = tf.split(
+            reshaped_hidden_state,
+            num_or_size_splits=[self.key_dim, self.attention_ratio * self.key_dim],
+            axis=3,
         )
-        key = key.permute(0, 2, 1, 3)
-        value = value.permute(0, 2, 1, 3)
+        key = tf.transpose(key, perm=(0, 2, 1, 3))
+        value = tf.transpose(value, perm=(0, 2, 1, 3))
 
         query = self.queries(self.queries_subsample(hidden_state))
-        query = query.view(batch_size, self.resolution_out**2, self.num_attention_heads, self.key_dim).permute(
-            0, 2, 1, 3
-        )
+        query = tf.reshape(query, shape=(batch_size, self.resolution_out**2, self.num_attention_heads, self.key_dim))
+        query = tf.transpose(query, perm=(0, 2, 1, 3))
 
-        attention = query @ key.transpose(-2, -1) * self.scale + self.get_attention_biases(hidden_state.device)
+        attention = tf.matmul(query, key, transpose_b=True) * self.scale + self.get_attention_biases(
+            hidden_state.device, attention_bias_idxs, training=training
+        )
-        attention = attention.softmax(dim=-1)
-        hidden_state = (attention @ value).transpose(1, 2).reshape(batch_size, -1, self.out_dim_projection)
+        attention = stable_softmax(attention, axis=-1)
+        hidden_state = tf.matmul(attention, value)
+        hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3))
+        hidden_state = tf.reshape(hidden_state, shape=(batch_size, -1, self.out_dim_projection))
         hidden_state = self.projection(self.activation(hidden_state))
         return hidden_state
 
 
-class LevitMLPLayer(nn.Module):
+class TFLevitMLPLayer(tf.keras.layers.Layer):
     """
     MLP Layer with `2X` expansion in contrast to ViT with `4X`.
     """
 
-    def __init__(self, input_dim, hidden_dim):
-        super().__init__()
-        self.linear_up = MLPLayerWithBN(input_dim, hidden_dim)
-        self.activation = nn.Hardswish()
-        self.linear_down = MLPLayerWithBN(hidden_dim, input_dim)
+    def __init__(self, input_dim, hidden_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.linear_up = TFMLPLayerWithBN(input_dim, hidden_dim)
+        self.activation = hard_swish
+        self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim)
 
-    def forward(self, hidden_state):
+    def call(self, hidden_state):
         hidden_state = self.linear_up(hidden_state)
         hidden_state = self.activation(hidden_state)
         hidden_state = self.linear_down(hidden_state)
         return hidden_state
 
 
-class LevitResidualLayer(nn.Module):
+class TFLevitResidualLayer(tf.keras.layers.Layer):
     """
-    Residual Block for LeViT
+    Residual Block for TFLeViT
     """
 
-    def __init__(self, module, drop_rate):
-        super().__init__()
+    def __init__(self, module, drop_rate, **kwargs):
+        super().__init__(**kwargs)
         self.module = module
         self.drop_rate = drop_rate
 
-    def forward(self, hidden_state):
-        if self.training and self.drop_rate > 0:
-            rnd = torch.rand(hidden_state.size(0), 1, 1, device=hidden_state.device)
-            rnd = rnd.ge_(self.drop_rate).div(1 - self.drop_rate).detach()
+    def call(self, hidden_state, training=None):
+        if training and self.drop_rate > 0:
+            # torch.rand draws from a uniform distribution, so the TF port uses
+            # tf.random.uniform; the boolean keep-mask is cast before the rescaling
+            rnd = tf.random.uniform(shape=(tf.shape(hidden_state)[0], 1, 1), minval=0, maxval=1)
+            rnd = tf.cast(tf.math.greater_equal(rnd, self.drop_rate), hidden_state.dtype)
+            rnd = tf.math.divide(rnd, (1 - self.drop_rate))
             hidden_state = hidden_state + self.module(hidden_state) * rnd
             return hidden_state
         else:
@@ -428,9 +449,9 @@ def forward(self, hidden_state):
             hidden_state = hidden_state + self.module(hidden_state)
             return hidden_state
 
 
-class LevitStage(nn.Module):
+class TFLevitStage(tf.keras.layers.Layer):
     """
-    LeViT Stage consisting of `LevitMLPLayer` and `LevitAttention` layers.
+    LeViT Stage consisting of `TFLevitMLPLayer` and `TFLevitAttention` layers.
     
""" def __init__( @@ -445,29 +466,32 @@ def __init__( mlp_ratio, down_ops, resolution_in, + **kwargs, ): - super().__init__() + super().__init__(**kwargs) self.layers = [] self.config = config self.resolution_in = resolution_in # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - for _ in range(depths): + + # TODO ariG23498: add the index values to the layer names + for idx in range(depths): self.layers.append( - LevitResidualLayer( - LevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), + TFLevitResidualLayer( + TFLevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), self.config.drop_path_rate, ) ) if mlp_ratio > 0: hidden_dim = hidden_sizes * mlp_ratio self.layers.append( - LevitResidualLayer(LevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate) + TFLevitResidualLayer(TFLevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate) ) if down_ops[0] == "Subsample": self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 self.layers.append( - LevitAttentionSubsample( + TFLevitAttentionSubsample( *self.config.hidden_sizes[idx : idx + 2], key_dim=down_ops[1], num_attention_heads=down_ops[2], @@ -481,36 +505,35 @@ def __init__( if down_ops[4] > 0: hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4] self.layers.append( - LevitResidualLayer( - LevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate + TFLevitResidualLayer( + TFLevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate ) ) - self.layers = nn.ModuleList(self.layers) - def get_resolution(self): return self.resolution_in - def forward(self, hidden_state): + def call(self, hidden_state): for layer in self.layers: hidden_state = layer(hidden_state) return hidden_state -class LevitEncoder(nn.Module): +class TFLevitEncoder(tf.keras.layers.Layer): """ - LeViT Encoder consisting of multiple `LevitStage` stages. + LeViT Encoder consisting of multiple `TFLevitStage` stages. 
""" - def __init__(self, config): - super().__init__() + def __init__(self, config, **kwargs): + super().__init__(**kwargs) self.config = config resolution = self.config.image_size // self.config.patch_size self.stages = [] self.config.down_ops.append([""]) + # TODO ariG23498: add the index values to the layer names for stage_idx in range(len(config.depths)): - stage = LevitStage( + stage = TFLevitStage( config, stage_idx, config.hidden_sizes[stage_idx], @@ -525,9 +548,7 @@ def __init__(self, config): resolution = stage.get_resolution() self.stages.append(stage) - self.stages = nn.ModuleList(self.stages) - - def forward(self, hidden_state, output_hidden_states=False, return_dict=True): + def call(self, hidden_state, output_hidden_states=False, return_dict=True, training=None): all_hidden_states = () if output_hidden_states else None for stage in self.stages: @@ -540,10 +561,10 @@ def forward(self, hidden_state, output_hidden_states=False, return_dict=True): if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states] if v is not None) - return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states) + return TFBaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states) -class LevitClassificationLayer(nn.Module): +class TFLevitClassificationLayer(tf.keras.layers.Layer): """ LeViT Classification Layer """ @@ -559,7 +580,46 @@ def forward(self, hidden_state): return logits -class LevitPreTrainedModel(PreTrainedModel): +@keras_serializable +class TFLeViTMainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + self.config = config + self.patch_embeddings = TFLevitPatchEmbeddings(config, name="patch_embeddings") + self.encoder = TFLevitEncoder(config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: tf.Tensor = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embeddings = self.patch_embeddings(pixel_values) + encoder_outputs = self.encoder( + embeddings, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + + # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) + pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + +class TFLevitPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
@@ -568,108 +628,161 @@ class LevitPreTrainedModel(PreTrainedModel): config_class = LevitConfig base_model_prefix = "levit" main_input_name = "pixel_values" - supports_gradient_checkpointing = True - def _init_weights(self, module): - """Initialize the weights""" - if isinstance(module, (nn.Linear, nn.Conv2d)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - if module.bias is not None: - module.bias.data.zero_() - elif isinstance(module, (nn.BatchNorm1d, nn.BatchNorm2d)): - module.bias.data.zero_() - module.weight.data.fill_(1.0) + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform( + shape=(3, self.config.num_channels, self.config.image_size, self.config.image_size), dtype=tf.float32 + ) + return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} + + @tf.function( + input_signature=[ + { + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + } + ] + ) + def serving(self, inputs): + """ + Method used for serving the model. - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, LevitModel): - module.gradient_checkpointing = value + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + output = self.call(inputs) + + return self.serving_output(output) LEVIT_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it - as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. - Parameters: - config ([`LevitConfig`]): Model configuration class with all the parameters of the model. + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! 
If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `pixel_values` only and nothing else: `model(pixel_values)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([pixel_values, attention_mask])` or `model([pixel_values, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"pixel_values": pixel_values, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`ViTConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. """ LEVIT_INPUTS_DOCSTRING = r""" Args: - pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See - [`AutoFeatureExtractor.__call__`] for details. + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See + [`ViTFeatureExtractor.__call__`] for details. + head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + interpolate_pos_encoding (`bool`, *optional*): + Whether to interpolate the pre-trained position encodings. return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). 
""" + @add_start_docstrings( "The bare Levit model outputting raw features without any specific head on top.", LEVIT_START_DOCSTRING, ) -class LevitModel(LevitPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.config = config - self.patch_embeddings = LevitPatchEmbeddings(config) - self.encoder = LevitEncoder(config) - # Initialize weights and apply final processing - self.post_init() +class TFLevitModel(TFLevitPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.levit = TFLevitMainLayer(config=config, name="levit") + @unpack_inputs @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) @add_code_sample_docstrings( processor_class=_FEAT_EXTRACTOR_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPoolingAndNoAttention, + output_type=TFBaseModelOutputWithPoolingAndNoAttention, config_class=_CONFIG_FOR_DOC, modality="vision", expected_output=_EXPECTED_OUTPUT_SHAPE, ) - def forward( + def call( self, - pixel_values: torch.FloatTensor = None, + pixel_values: tf.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ): - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - embeddings = self.patch_embeddings(pixel_values) - encoder_outputs = self.encoder( - embeddings, + outputs = self.levit( + pixel_values=pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - last_hidden_state = encoder_outputs[0] - - # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) - pooled_output = last_hidden_state.mean(dim=1) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return BaseModelOutputWithPoolingAndNoAttention( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, + return outputs + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=hs, + attentions=attns, ) + @add_start_docstrings( """ Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. 
for
@@ -677,50 +790,46 @@ def forward(
     """,
     LEVIT_START_DOCSTRING,
 )
-class LevitForImageClassification(LevitPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
+class TFLevitForImageClassification(TFLevitPreTrainedModel):
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
         self.config = config
         self.num_labels = config.num_labels
-        self.levit = LevitModel(config)
+        self.levit = TFLeViTMainLayer(config, name="levit")
 
         # Classifier head
         self.classifier = (
-            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier")
             if config.num_labels > 0
-            else torch.nn.Identity()
+            else tf.identity
         )
 
-        # Initialize weights and apply final processing
-        self.post_init()
-
+    @unpack_inputs
     @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
-        output_type=ImageClassifierOutputWithNoAttention,
+        output_type=TFImageClassifierOutputWithNoAttention,
         config_class=_CONFIG_FOR_DOC,
         expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
     )
-    def forward(
+    def call(
         self,
-        pixel_values: torch.FloatTensor = None,
-        labels: Optional[torch.LongTensor] = None,
+        pixel_values: tf.Tensor = None,
+        labels: Optional[tf.Tensor] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
         outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
 
         sequence_output = outputs[0]
-        sequence_output = sequence_output.mean(1)
+        sequence_output = tf.math.reduce_mean(sequence_output, axis=1)
         logits = self.classifier(sequence_output)
 
         loss = None
@@ -728,28 +837,31 @@ def forward(
             if self.config.problem_type is None:
                 if self.num_labels == 1:
                     self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                elif self.num_labels > 1 and (labels.dtype == tf.int32 or labels.dtype == tf.int64):
                     self.config.problem_type = "single_label_classification"
                 else:
                     self.config.problem_type = "multi_label_classification"
-
+            # TODO @ariG23498: Check the implementation of the loss functions for the
+            # various problem types
             if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
+                loss_fct = MeanSquaredError()
                 if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                    # Keras losses take `(y_true, y_pred)`, the reverse of the torch argument order
+                    loss = loss_fct(tf.squeeze(labels), tf.squeeze(logits))
                 else:
-                    loss = loss_fct(logits, labels)
+                    loss = loss_fct(labels, logits)
             elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+                # Integer targets on top of logits map to the sparse categorical crossentropy in Keras
+                loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+                loss = loss_fct(tf.reshape(labels, shape=(-1,)), tf.reshape(logits, shape=(-1, self.num_labels)))
             elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits, labels)
+                loss_fct = BinaryCrossentropy(from_logits=True)
+                loss = loss_fct(labels, logits)
         if not return_dict:
             output = (logits,) + outputs[2:]
             return ((loss,) + output) if loss is not None else output
 
-        return ImageClassifierOutputWithNoAttention(
+        return TFImageClassifierOutputWithNoAttention(
             loss=loss,
             logits=logits,
             hidden_states=outputs.hidden_states,
         )
@@ -765,48 +875,44 @@ def forward(
     """,
     LEVIT_START_DOCSTRING,
 )
-class LevitForImageClassificationWithTeacher(LevitPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
+class TFLevitForImageClassificationWithTeacher(TFLevitPreTrainedModel):
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
         self.config = config
         self.num_labels = config.num_labels
-        self.levit = LevitModel(config)
+        self.levit = TFLeViTMainLayer(config, name="levit")
 
         # Classifier head
         self.classifier = (
-            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier")
             if config.num_labels > 0
-            else torch.nn.Identity()
+            else tf.identity
         )
         self.classifier_distill = (
-            LevitClassificationLayer(config.hidden_sizes[-1], config.num_labels)
+            TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier_distill")
             if config.num_labels > 0
-            else torch.nn.Identity()
+            else tf.identity
         )
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
+
+    @unpack_inputs
     @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING)
     @add_code_sample_docstrings(
         processor_class=_FEAT_EXTRACTOR_FOR_DOC,
         checkpoint=_IMAGE_CLASS_CHECKPOINT,
-        output_type=LevitForImageClassificationWithTeacherOutput,
+        output_type=TFLevitForImageClassificationWithTeacherOutput,
         config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
-    def forward(
+    def call(
        self,
-        pixel_values: torch.FloatTensor = None,
+        pixel_values: tf.Tensor = None,
        output_hidden_states: 
Optional[bool] = None, return_dict: Optional[bool] = None, ): - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) sequence_output = outputs[0] - sequence_output = sequence_output.mean(1) + sequence_output = tf.math.reduce_mean(sequence_output, axis=1) cls_logits, distill_logits = self.classifier(sequence_output), self.classifier_distill(sequence_output) logits = (cls_logits + distill_logits) / 2 @@ -814,7 +920,7 @@ def forward( output = (logits, cls_logits, distill_logits) + outputs[2:] return output - return LevitForImageClassificationWithTeacherOutput( + return TFLevitForImageClassificationWithTeacherOutput( logits=logits, cls_logits=cls_logits, distillation_logits=distill_logits, From 94b35d24d001f400fb9128d62712daadab74e7e2 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sat, 8 Oct 2022 09:30:52 +0530 Subject: [PATCH 03/11] chore: adding training and other nits to TF --- .../models/levit/modeling_tf_levit.py | 92 +++++++++++-------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 6b1de7a42815..679f804e5d42 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -25,6 +25,7 @@ from ...modeling_outputs import ModelOutput from ...modeling_tf_outputs import ( TFBaseModelOutputWithNoAttention, + TFBaseModelOutputWithPooling, TFBaseModelOutputWithPoolingAndNoAttention, TFImageClassifierOutputWithNoAttention, ) @@ -416,9 +417,9 @@ class TFLevitMLPLayer(tf.keras.layers.Layer): def __init__(self, input_dim, hidden_dim, **kwargs): super().__init__(**kwargs) - self.linear_up = TFMLPLayerWithBN(input_dim, hidden_dim) + self.linear_up = TFMLPLayerWithBN(input_dim, hidden_dim, name="linear_up") self.activation = hard_swish - self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim) + self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim, name="linear_down") def call(self, hidden_state): hidden_state = self.linear_up(hidden_state) @@ -474,18 +475,22 @@ def __init__( self.resolution_in = resolution_in # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - # TODO ariG23498: add the index values to the layer names for idx in range(depths): self.layers.append( TFLevitResidualLayer( TFLevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), self.config.drop_path_rate, + name=f"layers.{idx}", ) ) if mlp_ratio > 0: hidden_dim = hidden_sizes * mlp_ratio self.layers.append( - TFLevitResidualLayer(TFLevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate) + TFLevitResidualLayer( + TFLevitMLPLayer(hidden_sizes, hidden_dim), + self.config.drop_path_rate, + name=f"layers.{idx}", + ) ) if down_ops[0] == "Subsample": @@ -499,6 +504,7 @@ def __init__( stride=down_ops[5], resolution_in=resolution_in, resolution_out=self.resolution_out, + name=f"layers.{idx}", ) ) self.resolution_in = self.resolution_out @@ -506,7 +512,9 @@ def __init__( hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4] self.layers.append( TFLevitResidualLayer( - TFLevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), self.config.drop_path_rate + TFLevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), + self.config.drop_path_rate, + name=f"layers.{idx}", ) ) @@ -544,6 +552,7 @@ def __init__(self, config, 
**kwargs): config.mlp_ratio[stage_idx], config.down_ops[stage_idx], resolution, + name=f"stages.{stage_idx}" ) resolution = stage.get_resolution() self.stages.append(stage) @@ -571,17 +580,19 @@ class TFLevitClassificationLayer(tf.keras.layers.Layer): def __init__(self, input_dim, output_dim): super().__init__() - self.batch_norm = nn.BatchNorm1d(input_dim) - self.linear = nn.Linear(input_dim, output_dim) - def forward(self, hidden_state): + # The epsilon and momentum used here are the defaults in torch batch norm layer. + self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") + self.linear = tf.keras.layers.Dense(units=output_dim, name="linear") + + def call(self, hidden_state, training=None): hidden_state = self.batch_norm(hidden_state) logits = self.linear(hidden_state) return logits @keras_serializable -class TFLeViTMainLayer(tf.keras.layers.Layer): +class TFLevitMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.config = config @@ -594,15 +605,17 @@ def call( pixel_values: tf.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + training: Optional[bool] = None, ): if pixel_values is None: raise ValueError("You have to specify pixel_values") - embeddings = self.patch_embeddings(pixel_values) + embeddings = self.patch_embeddings(pixel_values, training=training) encoder_outputs = self.encoder( embeddings, output_hidden_states=output_hidden_states, return_dict=return_dict, + training=training, ) last_hidden_state = encoder_outputs[0] @@ -699,36 +712,21 @@ def serving(self, inputs): Args: - config ([`ViTConfig`]): Model configuration class with all the parameters of the model. + config ([`LevitConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ LEVIT_INPUTS_DOCSTRING = r""" Args: - pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): - Pixel values. Pixel values can be obtained using [`ViTFeatureExtractor`]. See - [`ViTFeatureExtractor.__call__`] for details. - - head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): - Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - - - 1 indicates the head is **not masked**, - - 0 indicates the head is **masked**. - - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned - tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the - config will be used instead. + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for - more detail. 
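# The classification head above maps torch's BatchNorm1d + Linear pair onto
# Keras layers one-to-one; a standalone sketch of that pattern (names are
# illustrative, not the module's own):
import tensorflow as tf

class SketchClassificationLayer(tf.keras.layers.Layer):
    def __init__(self, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1)
        self.linear = tf.keras.layers.Dense(units=output_dim)

    def call(self, hidden_state, training=None):
        hidden_state = self.batch_norm(hidden_state, training=training)
        return self.linear(hidden_state)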
This argument can be used only in eager mode, in graph mode the value in the config will be - used instead. - interpolate_pos_encoding (`bool`, *optional*): - Whether to interpolate the pre-trained position encodings. + more detail. return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in - eager mode, in graph mode the value will always be set to True. + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. training (`bool`, *optional*, defaults to `False``): Whether or not to use the model in training mode (some modules like dropout modules have different behaviors between training and evaluation). @@ -761,16 +759,19 @@ def call( pixel_values: tf.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + training: Optional[bool] = None, ): outputs = self.levit( pixel_values=pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict, + training=training, ) return outputs - def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + # TODO @ariG23498: Check the output type for serving. + def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -795,11 +796,11 @@ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.config = config self.num_labels = config.num_labels - self.levit = TFLeViTMainLayer(config, name="levit") + self.levit = TFLevitMainLayer(config, name="levit") # Classifier head self.classifier = ( - TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels) + TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier") if config.num_labels > 0 else tf.identity ) @@ -819,6 +820,7 @@ def call( labels: Optional[tf.Tensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + training: Optional[bool] = None, ): r""" labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): @@ -826,7 +828,12 @@ def call( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
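# The `training` flag threaded through these `call` signatures is what switches
# BatchNormalization between batch statistics (training) and stored moving
# averages (inference); a tiny demonstration, independent of the model:
import tensorflow as tf

batch_norm = tf.keras.layers.BatchNormalization()
inputs = tf.random.normal((4, 8))
train_out = batch_norm(inputs, training=True)   # normalizes with batch statistics
infer_out = batch_norm(inputs, training=False)  # normalizes with moving averages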
""" - outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.levit( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) sequence_output = outputs[0] sequence_output = tf.math.reduce_mean(sequence_output, axis=1) @@ -841,8 +848,7 @@ def call( self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" - # TODO @ariG23498: Check the implementation of the loss fucntions for the - # various problem types + if self.config.problem_type == "regression": loss_fct = MeanSquaredError() if self.num_labels == 1: @@ -880,7 +886,7 @@ def __init__(self, config, **kwargs): super().__init__(config, **kwargs) self.config = config self.num_labels = config.num_labels - self.levit = TFLeViTMainLayer(config, name="levit") + self.levit = TFLevitMainLayer(config, name="levit") # Classifier head self.classifier = ( @@ -908,8 +914,14 @@ def call( pixel_values: tf.Tensor = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + training: Optional[bool] = None, ): - outputs = self.levit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + outputs = self.levit( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) sequence_output = outputs[0] sequence_output = tf.math.reduce_mean(sequence_output, axis=1) From 793385dca8fc0a99f4890e019941f63671285b22 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sat, 8 Oct 2022 14:33:16 +0530 Subject: [PATCH 04/11] chore: adding non trainable variables and training flag --- .../models/levit/modeling_tf_levit.py | 196 +++++++++++------- 1 file changed, 118 insertions(+), 78 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 679f804e5d42..b4277c069c62 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -17,6 +17,7 @@ import itertools from dataclasses import dataclass from typing import Optional, Tuple, Dict +from numpy import indices import tensorflow as tf from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy, CategoricalCrossentropy @@ -87,16 +88,7 @@ class TFLevitConvEmbeddings(tf.keras.layers.Layer): """ def __init__( - self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation=1, - groups=1, - bn_weight_init=1, - **kwargs, + self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1, **kwargs, ): super().__init__(**kwargs) self.convolution = tf.keras.layers.Conv2D( @@ -107,15 +99,15 @@ def __init__( dilation_rate=dilation, groups=groups, bias=False, - data_format="channels_first", # required for tf + data_format="channels_first", name="convolution", ) # The epsilon and momentum used here are the defaults in torch batch norm layer. 
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, embeddings): - embeddings = self.convolution(embeddings) - embeddings = self.batch_norm(embeddings) + def call(self, embeddings, training=None): + embeddings = self.convolution(embeddings, training=training) + embeddings = self.batch_norm(embeddings, training=training) return embeddings @@ -133,59 +125,61 @@ class TFLevitPatchEmbeddings(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) self.embedding_layer_1 = TFLevitConvEmbeddings( - config.num_channels, - config.hidden_sizes[0] // 8, - config.kernel_size, - config.stride, - config.padding, + in_channels=config.num_channels, + out_channels=config.hidden_sizes[0] // 8, + kernel_size=config.kernel_size, + stride=config.stride, + padding=config.padding, name="embedding_layer_1", ) self.activation_layer_1 = hard_swish self.embedding_layer_2 = TFLevitConvEmbeddings( - config.hidden_sizes[0] // 8, - config.hidden_sizes[0] // 4, - config.kernel_size, - config.stride, - config.padding, + in_channels=config.hidden_sizes[0] // 8, + out_channels=config.hidden_sizes[0] // 4, + kernel_size=config.kernel_size, + stride=config.stride, + padding=config.padding, name="embedding_layer_2", ) self.activation_layer_2 = hard_swish self.embedding_layer_3 = TFLevitConvEmbeddings( - config.hidden_sizes[0] // 4, - config.hidden_sizes[0] // 2, - config.kernel_size, - config.stride, - config.padding, + in_channels=config.hidden_sizes[0] // 4, + out_channels=config.hidden_sizes[0] // 2, + kernel_size=config.kernel_size, + stride=config.stride, + padding=config.padding, name="embedding_layer_3", ) self.activation_layer_3 = hard_swish self.embedding_layer_4 = TFLevitConvEmbeddings( - config.hidden_sizes[0] // 2, - config.hidden_sizes[0], - config.kernel_size, - config.stride, - config.padding, + in_channels=config.hidden_sizes[0] // 2, + out_channels=config.hidden_sizes[0], + kernel_size=config.kernel_size, + stride=config.stride, + padding=config.padding, name="embedding_layer_4", ) self.num_channels = config.num_channels - def call(self, pixel_values): + def call(self, pixel_values, training=None): batch_size = tf.shape(pixel_values)[0] num_channels = tf.shape(pixel_values)[1] + if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) - embeddings = self.embedding_layer_1(pixel_values) + + embeddings = self.embedding_layer_1(pixel_values, training=training) embeddings = self.activation_layer_1(embeddings) - embeddings = self.embedding_layer_2(embeddings) + embeddings = self.embedding_layer_2(embeddings, training=training) embeddings = self.activation_layer_2(embeddings) - embeddings = self.embedding_layer_3(embeddings) + embeddings = self.embedding_layer_3(embeddings, training=training) embeddings = self.activation_layer_3(embeddings) - embeddings = self.embedding_layer_4(embeddings) + embeddings = self.embedding_layer_4(embeddings, training=training) # Flatten the embeddings flattended_embeddings = tf.reshape(embeddings, shape=(batch_size, num_channels, -1)) # Transpose the channel and spatial axis of the flattened embeddings @@ -200,19 +194,24 @@ def __init__(self, input_dim, output_dim, bn_weight_init=1, **kwargs): # The epsilon and momentum used here are the defaults in torch batch norm layer. 
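# Sketch of the flatten-and-transpose step in the patch embeddings above: a
# (batch, channels, height, width) feature map becomes a
# (batch, height * width, channels) token sequence (sizes illustrative):
import tensorflow as tf

feature_map = tf.random.normal((2, 8, 4, 4))          # (batch, channels, height, width)
flattened = tf.reshape(feature_map, (2, 8, -1))       # (2, 8, 16)
tokens = tf.transpose(flattened, perm=(0, 2, 1))      # (2, 16, 8)
assert tokens.shape == (2, 16, 8)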
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, hidden_state): + def call(self, hidden_state, training=None): num_channels = tf.shape(hidden_state)[2] - hidden_state = self.linear(hidden_state) + hidden_state = self.linear(hidden_state, training=training) + # Before sending the hidden state to the batch normalization layer, we would have to # flatten the hidden states in the batch and seq len dimension flattened_hidden_state = tf.reshape(hidden_state, shape=(-1, num_channels)) - batch_norm_hidden_state = self.batch_norm(flattened_hidden_state) + batch_norm_hidden_state = self.batch_norm(flattened_hidden_state, training=training) + # Reshape the output of batch norm to have the same shape as the original hidden state hidden_state = tf.reshape(batch_norm_hidden_state, shape=tf.shape(hidden_state)) return hidden_state class TFLevitSubsample(tf.keras.layers.Layer): + """ + Layer to subsample the activatioin maps + """ def __init__(self, stride, resolution, **kwargs): super().__init__() self.stride = stride @@ -221,11 +220,13 @@ def __init__(self, stride, resolution, **kwargs): def call(self, hidden_state): batch_size = tf.shape(hidden_state)[0] channels = tf.shape(hidden_state)[2] + reshaped_hidden_state = tf.reshape( hidden_state, shape=(batch_size, self.resolution, self.resolution, channels) ) strided_hidden_state = reshaped_hidden_state[:, :: self.stride, :: self.stride] hidden_state = tf.reshape(strided_hidden_state, shape=(batch_size, -1, channels)) + return hidden_state @@ -243,15 +244,23 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, self.activation = hard_swish self.projection = TFMLPLayerWithBN(self.out_dim_projection, hidden_sizes, bn_weight_init=0, name="projection") + # Build tuples of points in the entire resolution range of the pixel values points = list(itertools.product(range(resolution), range(resolution))) - len_points = len(points) + self.len_points = len(points) + + # Initialize the attention offsets and indices attention_offsets, indices = {}, [] - for p1 in points: - for p2 in points: + + # Iterate over the points generator and calculate the offset between the initial + # point (0, 0) and the rest of the points [(0, 1), (0, 2)...] 
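# Sketch of the subsampling above: the token sequence is viewed as its 2D grid,
# every `stride`-th row and column is kept, and the result is flattened back;
# a 4x4 grid with stride 2 keeps 4 of the 16 tokens:
import tensorflow as tf

tokens = tf.random.normal((2, 16, 3))                 # (batch, 4 * 4, channels)
grid = tf.reshape(tokens, (2, 4, 4, 3))
strided = grid[:, ::2, ::2]                           # (2, 2, 2, 3)
subsampled = tf.reshape(strided, (2, -1, 3))          # (2, 4, 3)
assert subsampled.shape == (2, 4, 3)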
+ for p1 in points: # this iterates only once + for p2 in points: # iterate over all the points other than (0, 0) offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) if offset not in attention_offsets: attention_offsets[offset] = len(attention_offsets) indices.append(attention_offsets[offset]) + + # Store the attention offsets, indices and attention bias cache self.attention_offsets = attention_offsets self.indices = indices self.attention_bias_cache = {} @@ -263,6 +272,12 @@ def build(self, input_shape): trainable=True, name="attention_biases", ) + self.attention_bias_idxs = tf.Variable( + initial_value=tf.reshape(self.indices, (self.len_points, self.len_points)), + trainable=False, # this is a registered buffer and not a parameter + dtype=tf.float32, + name="attention_bias_idxs", + ) super().build(input_shape) # TODO @ariG23498 @@ -272,16 +287,16 @@ def train(self, mode=True): if mode and self.attention_bias_cache: self.attention_bias_cache = {} # clear ab cache - def get_attention_biases(self, device, attention_bias_idxs, training=None): + def get_attention_biases(self, device, training=None): if training: - return self.attention_biases[:, attention_bias_idxs] + return self.attention_biases[:, self.attention_bias_idxs] else: device_key = str(device) if device_key not in self.attention_bias_cache: self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] return self.attention_bias_cache[device_key] - def call(self, hidden_state, attention_bias_idxs, training=None): + def call(self, hidden_state, training=None): batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] queries_keys_values = self.queries_keys_values(hidden_state) @@ -300,7 +315,7 @@ def call(self, hidden_state, attention_bias_idxs, training=None): value = tf.transpose(value, perm=(0, 2, 1, 3)) attention = tf.matmul(query, key, transpose_b=True) * self.scale + self.get_attention_biases( - hidden_state.device, attention_bias_idxs, training=training + hidden_state.device, training=training ) attention = stable_softmax(attention, axis=-1) hidden_state = tf.matmul(attention, value) @@ -342,7 +357,7 @@ def __init__( points = list(itertools.product(range(resolution_in), range(resolution_in))) points_ = list(itertools.product(range(resolution_out), range(resolution_out))) - len_points, len_points_ = len(points), len(points_) + self.len_points, self.len_points_ = len(points), len(points_) attention_offsets, indices = {}, [] for p1 in points_: for p2 in points: @@ -353,6 +368,7 @@ def __init__( indices.append(attention_offsets[offset]) self.attention_offsets = attention_offsets + self.indices = indices def build(self, input_shape): self.attention_biases = self.add_weight( @@ -361,6 +377,13 @@ def build(self, input_shape): trainable=True, name="attention_biases", ) + + self.attention_bias_idxs = tf.Variable( + initial_value=tf.reshape(self.indices, (self.len_points_, self.len_points)), + trainable=False, + dtype=tf.float32, + name="attention_bias_idxs", + ) super().build(input_shape) # TODO @ariG23498 @@ -370,23 +393,22 @@ def train(self, mode=True): if mode and self.attention_bias_cache: self.attention_bias_cache = {} # clear ab cache - def get_attention_biases(self, device, attention_bias_idxs, training=None): + def get_attention_biases(self, device, training=None): if training: - return self.attention_biases[:, attention_bias_idxs] + return self.attention_biases[:, self.attention_bias_idxs] else: device_key = str(device) if device_key not in self.attention_bias_cache: 
self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] return self.attention_bias_cache[device_key] - def call(self, hidden_state, attention_bias_idxs, training=None): + def call(self, hidden_state, training=None): batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] - + # Process the hidden states and reshape it reshaped_hidden_state = tf.reshape( - self.keys_values(hidden_state), - shape=(batch_size, seq_length, self.num_attention_heads, -1) + self.keys_values(hidden_state), shape=(batch_size, seq_length, self.num_attention_heads, -1) ) # Split the reshaped hidden state into key and value key, value = tf.split( @@ -402,11 +424,13 @@ def call(self, hidden_state, attention_bias_idxs, training=None): query = tf.transpose(query, perm=(0, 2, 1, 3)) attention = tf.matmul(query, key, transpose_b=True) * self.scale + self.get_attention_biases( - hidden_state.device, attention_bias_idxs, training=training + hidden_state.device, training=training ) - attention = attention.softmax(dim=-1) - hidden_state = (attention @ value).transpose(1, 2).reshape(batch_size, -1, self.out_dim_projection) - hidden_state = self.projection(self.activation(hidden_state)) + attention = stable_softmax(attention, axis=-1) + hidden_state = tf.matmul(attention, value) + hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3)) + hidden_state = tf.reshape(hidden_state, (batch_size, -1, self.out_dim_projection)) + hidden_state = self.projection(self.activation(hidden_state), training=training) return hidden_state @@ -474,7 +498,7 @@ def __init__( self.config = config self.resolution_in = resolution_in # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - + for idx in range(depths): self.layers.append( TFLevitResidualLayer( @@ -552,7 +576,7 @@ def __init__(self, config, **kwargs): config.mlp_ratio[stage_idx], config.down_ops[stage_idx], resolution, - name=f"stages.{stage_idx}" + name=f"stages.{stage_idx}", ) resolution = stage.get_resolution() self.stages.append(stage) @@ -583,11 +607,11 @@ def __init__(self, input_dim, output_dim): # The epsilon and momentum used here are the defaults in torch batch norm layer. 
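# Shape walk-through of the attention arithmetic above, with `tf.nn.softmax`
# standing in for the library's `stable_softmax` (sizes illustrative):
import tensorflow as tf

batch, heads, seq, key_dim = 2, 4, 16, 8
query = tf.random.normal((batch, heads, seq, key_dim))
key = tf.random.normal((batch, heads, seq, key_dim))
value = tf.random.normal((batch, heads, seq, key_dim))

scores = tf.matmul(query, key, transpose_b=True) * key_dim**-0.5   # (2, 4, 16, 16)
weights = tf.nn.softmax(scores, axis=-1)
context = tf.matmul(weights, value)                                # (2, 4, 16, 8)
context = tf.transpose(context, perm=(0, 2, 1, 3))                 # (2, 16, 4, 8)
context = tf.reshape(context, (batch, seq, heads * key_dim))       # (2, 16, 32)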
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - self.linear = tf.keras.layers.Dense(units=output_dim, name="linear") + self.linear = tf.keras.layers.Dense(units=output_dim, bias=False, name="linear") def call(self, hidden_state, training=None): - hidden_state = self.batch_norm(hidden_state) - logits = self.linear(hidden_state) + hidden_state = self.batch_norm(hidden_state, training=training) + logits = self.linear(hidden_state, training=training) return logits @@ -610,7 +634,10 @@ def call( if pixel_values is None: raise ValueError("You have to specify pixel_values") + # Apply patch embeddings to the pixel values embeddings = self.patch_embeddings(pixel_values, training=training) + + # Apply encoder to the encoded pixel values encoder_outputs = self.encoder( embeddings, output_hidden_states=output_hidden_states, @@ -618,7 +645,8 @@ def call( training=training, ) - last_hidden_state = encoder_outputs[0] + # Obtain the `last_hidden_state` + last_hidden_state = encoder_outputs[0] # encoder_outputs.last_hidden_state # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1) @@ -629,9 +657,10 @@ def call( return TFBaseModelOutputWithPoolingAndNoAttention( last_hidden_state=last_hidden_state, pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, + hidden_states=encoder_outputs.hidden_states, # only if the `output_hidden_states` is set to True ) + class TFLevitPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -733,7 +762,6 @@ def serving(self, inputs): """ - @add_start_docstrings( "The bare Levit model outputting raw features without any specific head on top.", LEVIT_START_DOCSTRING, @@ -741,7 +769,7 @@ def serving(self, inputs): class TFLevitModel(TFLevitPreTrainedModel): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) - + self.levit = TFLevitMainLayer(config=config, name="levit") @unpack_inputs @@ -769,7 +797,7 @@ def call( ) return outputs - + # TODO @ariG23498: Check the output type for serving. def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None @@ -783,7 +811,6 @@ def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> ) - @add_start_docstrings( """ Levit Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for @@ -828,6 +855,7 @@ def call( config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
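# The pooled output above is a plain mean over the token axis, replacing
# torch's `sequence_output.mean(1)`:
import tensorflow as tf

last_hidden_state = tf.random.normal((2, 16, 384))         # (batch, seq, hidden)
pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1)
assert pooled_output.shape == (2, 384)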
""" + # Get the outputs from the levit main layer outputs = self.levit( pixel_values=pixel_values, output_hidden_states=output_hidden_states, @@ -835,15 +863,19 @@ def call( training=training, ) - sequence_output = outputs[0] + # Get the `last_hidden_state` and average it along the number of sequences + sequence_output = outputs[0] # outputs.last_hidden_state sequence_output = tf.math.reduce_mean(sequence_output, axis=1) - logits = self.classifier(sequence_output) + + # Apply the classifier head and obtain the logits + logits = self.classifier(sequence_output, training=training) loss = None if labels is not None: if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" + # TODO @ariG23498: Check with the dtypes (long and int in torch) elif self.num_labels > 1 and (labels.dtype == tf.float64 or labels.dtype == tf.int64): self.config.problem_type = "single_label_classification" else: @@ -868,7 +900,7 @@ def call( return TFImageClassifierOutputWithNoAttention( loss=loss, logits=logits, - hidden_states=outputs.hidden_states, + hidden_states=outputs.hidden_states, # only if `output_hidden_states` flag is set to True ) @@ -899,7 +931,7 @@ def __init__(self, config, **kwargs): if config.num_labels > 0 else tf.identity ) - + @unpack_inputs @add_start_docstrings_to_model_forward(LEVIT_INPUTS_DOCSTRING) @add_code_sample_docstrings( @@ -916,6 +948,7 @@ def call( return_dict: Optional[bool] = None, training: Optional[bool] = None, ): + # Get the output from the levit main layer outputs = self.levit( pixel_values=pixel_values, output_hidden_states=output_hidden_states, @@ -923,9 +956,16 @@ def call( training=training, ) - sequence_output = outputs[0] + # Get the `last_hidden_state` and average it along the number of sequences + sequence_output = outputs[0] # outputs.last_hidden_state sequence_output = tf.math.reduce_mean(sequence_output, axis=1) - cls_logits, distill_logits = self.classifier(sequence_output), self.classifier_distill(sequence_output) + + # Apply the classifier heads and obtain the `cls_logits` and `distill_logits` + cls_logits, distill_logits = self.classifier(sequence_output, training=training), self.classifier_distill( + sequence_output, training=training + ) + + # According to the paper, the cls and distill logits are averaged logits = (cls_logits + distill_logits) / 2 if not return_dict: @@ -936,5 +976,5 @@ def call( logits=logits, cls_logits=cls_logits, distillation_logits=distill_logits, - hidden_states=outputs.hidden_states, + hidden_states=outputs.hidden_states, # only if `output_hidden_states` flag is set to True ) From 7982dea98140035755eef15bc999aecca3ff2a99 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sat, 8 Oct 2022 15:05:29 +0530 Subject: [PATCH 05/11] chore: adapting till TFLevitStage --- .../models/levit/modeling_tf_levit.py | 48 +++++++++++-------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index b4277c069c62..200114dfac5b 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -95,10 +95,10 @@ def __init__( filters=out_channels, kernel_size=kernel_size, strides=stride, - padding=padding, + padding=(padding, padding), # TODO @ariG23498: Make sure the padding is a tuple dilation_rate=dilation, groups=groups, - bias=False, + use_bias=False, data_format="channels_first", name="convolution", ) @@ -190,7 +190,7 @@ def call(self, 
pixel_values, training=None): class TFMLPLayerWithBN(tf.keras.layers.Layer): def __init__(self, input_dim, output_dim, bn_weight_init=1, **kwargs): super().__init__(**kwargs) - self.linear = tf.keras.layers.Dense(units=output_dim, bias=False, name="linear") + self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") # The epsilon and momentum used here are the defaults in torch batch norm layer. self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") @@ -280,12 +280,12 @@ def build(self, input_shape): ) super().build(input_shape) - # TODO @ariG23498 - @torch.no_grad() - def train(self, mode=True): - super().train(mode) - if mode and self.attention_bias_cache: - self.attention_bias_cache = {} # clear ab cache + # # TODO @ariG23498 + # @torch.no_grad() + # def train(self, mode=True): + # super().train(mode) + # if mode and self.attention_bias_cache: + # self.attention_bias_cache = {} # clear ab cache def get_attention_biases(self, device, training=None): if training: @@ -386,12 +386,12 @@ def build(self, input_shape): ) super().build(input_shape) - # TODO @ariG23498 - @torch.no_grad() - def train(self, mode=True): - super().train(mode) - if mode and self.attention_bias_cache: - self.attention_bias_cache = {} # clear ab cache + # # TODO @ariG23498 + # @torch.no_grad() + # def train(self, mode=True): + # super().train(mode) + # if mode and self.attention_bias_cache: + # self.attention_bias_cache = {} # clear ab cache def get_attention_biases(self, device, training=None): if training: @@ -445,10 +445,10 @@ def __init__(self, input_dim, hidden_dim, **kwargs): self.activation = hard_swish self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim, name="linear_down") - def call(self, hidden_state): - hidden_state = self.linear_up(hidden_state) + def call(self, hidden_state, training=None): + hidden_state = self.linear_up(hidden_state, training=training) hidden_state = self.activation(hidden_state) - hidden_state = self.linear_down(hidden_state) + hidden_state = self.linear_down(hidden_state, training=training) return hidden_state @@ -518,10 +518,14 @@ def __init__( ) if down_ops[0] == "Subsample": + + print("info", self.config.hidden_sizes) + print("info", idx) self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 self.layers.append( TFLevitAttentionSubsample( - *self.config.hidden_sizes[idx : idx + 2], + input_dim=self.config.hidden_sizes[idx], + output_dim=self.config.hidden_sizes[idx + 1], key_dim=down_ops[1], num_attention_heads=down_ops[2], attention_ratio=down_ops[3], @@ -607,7 +611,7 @@ def __init__(self, input_dim, output_dim): # The epsilon and momentum used here are the defaults in torch batch norm layer. 
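# Why `use_bias=False` in the layers above: each one feeds straight into a
# BatchNormalization layer, whose learned offset (beta) makes a preceding bias
# redundant; a minimal sketch of the pairing:
import tensorflow as tf

linear = tf.keras.layers.Dense(units=32, use_bias=False)
batch_norm = tf.keras.layers.BatchNormalization()
outputs = batch_norm(linear(tf.random.normal((4, 16))), training=True)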
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - self.linear = tf.keras.layers.Dense(units=output_dim, bias=False, name="linear") + self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") def call(self, hidden_state, training=None): hidden_state = self.batch_norm(hidden_state, training=training) @@ -617,8 +621,10 @@ def call(self, hidden_state, training=None): @keras_serializable class TFLevitMainLayer(tf.keras.layers.Layer): + config_class = LevitConfig + def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) + super().__init__(**kwargs) self.config = config self.patch_embeddings = TFLevitPatchEmbeddings(config, name="patch_embeddings") self.encoder = TFLevitEncoder(config, name="encoder") From 57f5f74dc3e7036a1137e92ca6afaf4b330a85eb Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Sun, 9 Oct 2022 12:02:03 +0530 Subject: [PATCH 06/11] chore: aligning till attention biases --- .../models/levit/modeling_tf_levit.py | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 200114dfac5b..77820d7388ea 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -16,12 +16,11 @@ import itertools from dataclasses import dataclass -from typing import Optional, Tuple, Dict -from numpy import indices +from typing import Dict, Optional, Tuple import tensorflow as tf -from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy, CategoricalCrossentropy from tensorflow.keras import backend as K +from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy, MeanSquaredError from ...modeling_outputs import ModelOutput from ...modeling_tf_outputs import ( @@ -59,7 +58,7 @@ @dataclass class TFLevitForImageClassificationWithTeacherOutput(ModelOutput): """ - Output type of [`LevitForImageClassificationWithTeacher`]. + Output type of [`TFLevitForImageClassificationWithTeacher`]. Args: logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): @@ -95,18 +94,20 @@ def __init__( filters=out_channels, kernel_size=kernel_size, strides=stride, - padding=(padding, padding), # TODO @ariG23498: Make sure the padding is a tuple + padding="SAME", # TODO @ariG23498: Make sure the padding is a tuple dilation_rate=dilation, groups=groups, use_bias=False, - data_format="channels_first", + data_format="channels_last", name="convolution", ) # The epsilon and momentum used here are the defaults in torch batch norm layer. 
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") def call(self, embeddings, training=None): + embeddings = tf.transpose(embeddings, perm=(0, 2, 3, 1)) embeddings = self.convolution(embeddings, training=training) + embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) embeddings = self.batch_norm(embeddings, training=training) return embeddings @@ -181,6 +182,7 @@ def call(self, pixel_values, training=None): embeddings = self.activation_layer_3(embeddings) embeddings = self.embedding_layer_4(embeddings, training=training) # Flatten the embeddings + num_channels = tf.shape(embeddings)[1] flattended_embeddings = tf.reshape(embeddings, shape=(batch_size, num_channels, -1)) # Transpose the channel and spatial axis of the flattened embeddings transpose_embeddings = tf.transpose(flattended_embeddings, perm=(0, 2, 1)) @@ -275,7 +277,7 @@ def build(self, input_shape): self.attention_bias_idxs = tf.Variable( initial_value=tf.reshape(self.indices, (self.len_points, self.len_points)), trainable=False, # this is a registered buffer and not a parameter - dtype=tf.float32, + dtype=tf.int32, name="attention_bias_idxs", ) super().build(input_shape) @@ -293,6 +295,8 @@ def get_attention_biases(self, device, training=None): else: device_key = str(device) if device_key not in self.attention_bias_cache: + print("INFO biases cache", self.attention_biases.shape) + print("INFO biases index", self.attention_bias_idxs.shape) self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] return self.attention_bias_cache[device_key] @@ -381,7 +385,7 @@ def build(self, input_shape): self.attention_bias_idxs = tf.Variable( initial_value=tf.reshape(self.indices, (self.len_points_, self.len_points)), trainable=False, - dtype=tf.float32, + dtype=tf.int32, name="attention_bias_idxs", ) super().build(input_shape) @@ -498,13 +502,12 @@ def __init__( self.config = config self.resolution_in = resolution_in # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - - for idx in range(depths): + for index in range(depths): self.layers.append( TFLevitResidualLayer( TFLevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), self.config.drop_path_rate, - name=f"layers.{idx}", + name=f"layers.{index}", ) ) if mlp_ratio > 0: @@ -513,19 +516,15 @@ def __init__( TFLevitResidualLayer( TFLevitMLPLayer(hidden_sizes, hidden_dim), self.config.drop_path_rate, - name=f"layers.{idx}", + name=f"layers.{index}", ) ) if down_ops[0] == "Subsample": - - print("info", self.config.hidden_sizes) - print("info", idx) self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 self.layers.append( TFLevitAttentionSubsample( - input_dim=self.config.hidden_sizes[idx], - output_dim=self.config.hidden_sizes[idx + 1], + *self.config.hidden_sizes[idx : idx + 2], key_dim=down_ops[1], num_attention_heads=down_ops[2], attention_ratio=down_ops[3], From fc816813f1a200da519dc9f2dab077a7c8317494 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Mon, 21 Nov 2022 21:19:07 +0530 Subject: [PATCH 07/11] chore: adding padding before conv in TFLevitConvEmbeddings --- src/transformers/models/levit/modeling_tf_levit.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 77820d7388ea..9abf8aebd7df 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ 
b/src/transformers/models/levit/modeling_tf_levit.py @@ -87,14 +87,15 @@ class TFLevitConvEmbeddings(tf.keras.layers.Layer): """ def __init__( - self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1, **kwargs, + self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1, *args, **kwargs, ): - super().__init__(**kwargs) + super().__init__(*args, **kwargs) + # The padding layer is built in order to pad the inputs before entering the convolution operation. + self.padding = tf.keras.layers.ZeroPadding2D(padding=padding) self.convolution = tf.keras.layers.Conv2D( filters=out_channels, kernel_size=kernel_size, strides=stride, - padding="SAME", # TODO @ariG23498: Make sure the padding is a tuple dilation_rate=dilation, groups=groups, use_bias=False, @@ -104,8 +105,9 @@ def __init__( # The epsilon and momentum used here are the defaults in torch batch norm layer. self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, embeddings, training=None): + def call(self, embeddings: tf.Tensor, training: Optional[bool]=None): embeddings = tf.transpose(embeddings, perm=(0, 2, 3, 1)) + embeddings = self.padding(embeddings) embeddings = self.convolution(embeddings, training=training) embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) embeddings = self.batch_norm(embeddings, training=training) From 876294a6dc59c6c85d3fbb6317defe1f74590a2a Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Mon, 21 Nov 2022 22:09:28 +0530 Subject: [PATCH 08/11] chore: modification to the reshape operation in TFMLPLayerWithBN --- .../models/levit/modeling_tf_levit.py | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 9abf8aebd7df..f2f12cf4dbb6 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -125,8 +125,8 @@ class TFLevitPatchEmbeddings(tf.keras.layers.Layer): `TFLevitConvEmbeddings`. 
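# Sketch of the padding change above: Keras `Conv2D` accepts only "valid" or
# "same", not torch's integer `padding`, so an explicit `ZeroPadding2D` before
# a "valid" convolution reproduces `Conv2d(..., padding=p)` spatial arithmetic:
import tensorflow as tf

pad = tf.keras.layers.ZeroPadding2D(padding=1)
conv = tf.keras.layers.Conv2D(filters=8, kernel_size=3, strides=2, use_bias=False)
x = tf.random.normal((1, 16, 16, 3))        # channels-last input
y = conv(pad(x))                            # 16x16 -> 8x8, as in torch
assert y.shape == (1, 8, 8, 8)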
""" - def __init__(self, config, **kwargs): - super().__init__(**kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(*args, **kwargs) self.embedding_layer_1 = TFLevitConvEmbeddings( in_channels=config.num_channels, out_channels=config.hidden_sizes[0] // 8, @@ -167,7 +167,7 @@ def __init__(self, config, **kwargs): ) self.num_channels = config.num_channels - def call(self, pixel_values, training=None): + def call(self, pixel_values: tf.Tensor, training: Optional[bool]=None): batch_size = tf.shape(pixel_values)[0] num_channels = tf.shape(pixel_values)[1] @@ -183,6 +183,7 @@ def call(self, pixel_values, training=None): embeddings = self.embedding_layer_3(embeddings, training=training) embeddings = self.activation_layer_3(embeddings) embeddings = self.embedding_layer_4(embeddings, training=training) + # Flatten the embeddings num_channels = tf.shape(embeddings)[1] flattended_embeddings = tf.reshape(embeddings, shape=(batch_size, num_channels, -1)) @@ -192,23 +193,28 @@ def call(self, pixel_values, training=None): class TFMLPLayerWithBN(tf.keras.layers.Layer): - def __init__(self, input_dim, output_dim, bn_weight_init=1, **kwargs): - super().__init__(**kwargs) - self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") + def __init__(self, input_dim, output_dim, bn_weight_init=1, *args, **kwargs): + super().__init__(*args, **kwargs) + self.linear = tf.keras.layers.Dense( + units=output_dim, + use_bias=False, + name="linear" + ) # The epsilon and momentum used here are the defaults in torch batch norm layer. self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, hidden_state, training=None): - num_channels = tf.shape(hidden_state)[2] + def call(self, hidden_state: tf.Tensor, training: Optional[bool]=None): hidden_state = self.linear(hidden_state, training=training) # Before sending the hidden state to the batch normalization layer, we would have to - # flatten the hidden states in the batch and seq len dimension - flattened_hidden_state = tf.reshape(hidden_state, shape=(-1, num_channels)) + # flatten the hidden states with start=0 and end=1. 
+ hidden_state_shape_list = shape_list(hidden_state) + hidden_state_reshape_list = [hidden_state_shape_list[0] * hidden_state_shape_list[1]] + hidden_state_shape_list[2:] + flattened_hidden_state = tf.reshape(hidden_state, shape=hidden_state_reshape_list) batch_norm_hidden_state = self.batch_norm(flattened_hidden_state, training=training) # Reshape the output of batch norm to have the same shape as the original hidden state - hidden_state = tf.reshape(batch_norm_hidden_state, shape=tf.shape(hidden_state)) + hidden_state = tf.reshape(batch_norm_hidden_state, shape=shape_list(hidden_state)) return hidden_state From 8bbc04743d731d74576fd98ec79bc7c7c38b4e5f Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Wed, 23 Nov 2022 13:30:56 +0530 Subject: [PATCH 09/11] chore: all the variables of LeViT model are ported in TF --- .../models/levit/modeling_tf_levit.py | 380 ++++++++++-------- 1 file changed, 213 insertions(+), 167 deletions(-) diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index f2f12cf4dbb6..4c8af385b2e5 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -16,7 +16,7 @@ import itertools from dataclasses import dataclass -from typing import Dict, Optional, Tuple +from typing import Dict, Optional, Tuple, Union import tensorflow as tf from tensorflow.keras import backend as K @@ -70,9 +70,9 @@ class token). Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the distillation token). hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer - plus the initial embedding outputs. + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. """ logits: tf.Tensor = None @@ -87,7 +87,17 @@ class TFLevitConvEmbeddings(tf.keras.layers.Layer): """ def __init__( - self, in_channels, out_channels, kernel_size, stride, padding, dilation=1, groups=1, bn_weight_init=1, *args, **kwargs, + self, + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=1, + groups=1, + bn_weight_init=1, + *args, + **kwargs, ): super().__init__(*args, **kwargs) # The padding layer is built in order to pad the inputs before entering the convolution operation. @@ -105,12 +115,12 @@ def __init__( # The epsilon and momentum used here are the defaults in torch batch norm layer. 
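# Sketch of the reshape introduced above: batch and sequence axes are merged so
# BatchNormalization sees a 2D (rows, channels) tensor, then the original shape
# is restored; `.shape.as_list()` stands in here for the library's `shape_list`:
import tensorflow as tf

hidden_state = tf.random.normal((2, 16, 8))
shape = hidden_state.shape.as_list()
flattened = tf.reshape(hidden_state, [shape[0] * shape[1]] + shape[2:])   # (32, 8)
batch_norm = tf.keras.layers.BatchNormalization()
restored = tf.reshape(batch_norm(flattened, training=True), shape)
assert restored.shape == (2, 16, 8)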
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, embeddings: tf.Tensor, training: Optional[bool]=None): + def call(self, embeddings: tf.Tensor, training: Optional[bool] = None): embeddings = tf.transpose(embeddings, perm=(0, 2, 3, 1)) embeddings = self.padding(embeddings) embeddings = self.convolution(embeddings, training=training) - embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) embeddings = self.batch_norm(embeddings, training=training) + embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) return embeddings @@ -167,15 +177,15 @@ def __init__(self, config, *args, **kwargs): ) self.num_channels = config.num_channels - def call(self, pixel_values: tf.Tensor, training: Optional[bool]=None): + def call(self, pixel_values: tf.Tensor, training: Optional[bool] = None): batch_size = tf.shape(pixel_values)[0] num_channels = tf.shape(pixel_values)[1] - + if num_channels != self.num_channels: raise ValueError( "Make sure that the channel dimension of the pixel values match with the one set in the configuration." ) - + embeddings = self.embedding_layer_1(pixel_values, training=training) embeddings = self.activation_layer_1(embeddings) embeddings = self.embedding_layer_2(embeddings, training=training) @@ -183,7 +193,7 @@ def call(self, pixel_values: tf.Tensor, training: Optional[bool]=None): embeddings = self.embedding_layer_3(embeddings, training=training) embeddings = self.activation_layer_3(embeddings) embeddings = self.embedding_layer_4(embeddings, training=training) - + # Flatten the embeddings num_channels = tf.shape(embeddings)[1] flattended_embeddings = tf.reshape(embeddings, shape=(batch_size, num_channels, -1)) @@ -195,24 +205,22 @@ def call(self, pixel_values: tf.Tensor, training: Optional[bool]=None): class TFMLPLayerWithBN(tf.keras.layers.Layer): def __init__(self, input_dim, output_dim, bn_weight_init=1, *args, **kwargs): super().__init__(*args, **kwargs) - self.linear = tf.keras.layers.Dense( - units=output_dim, - use_bias=False, - name="linear" - ) + self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") # The epsilon and momentum used here are the defaults in torch batch norm layer. self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - def call(self, hidden_state: tf.Tensor, training: Optional[bool]=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state = self.linear(hidden_state, training=training) - + # Before sending the hidden state to the batch normalization layer, we would have to # flatten the hidden states with start=0 and end=1. 
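# Sketch of the layout juggling above: the port keeps tensors channels-first at
# module boundaries (matching the torch reference), while Conv2D and
# BatchNormalization run channels-last, so inputs are transposed in and back out:
import tensorflow as tf

x_nchw = tf.random.normal((1, 3, 16, 16))
x_nhwc = tf.transpose(x_nchw, perm=(0, 2, 3, 1))    # NCHW -> NHWC
y_nhwc = tf.keras.layers.Conv2D(filters=8, kernel_size=3, padding="same")(x_nhwc)
y_nchw = tf.transpose(y_nhwc, perm=(0, 3, 1, 2))    # NHWC -> NCHW
assert y_nchw.shape == (1, 8, 16, 16)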
hidden_state_shape_list = shape_list(hidden_state) - hidden_state_reshape_list = [hidden_state_shape_list[0] * hidden_state_shape_list[1]] + hidden_state_shape_list[2:] + hidden_state_reshape_list = [ + hidden_state_shape_list[0] * hidden_state_shape_list[1] + ] + hidden_state_shape_list[2:] flattened_hidden_state = tf.reshape(hidden_state, shape=hidden_state_reshape_list) batch_norm_hidden_state = self.batch_norm(flattened_hidden_state, training=training) - + # Reshape the output of batch norm to have the same shape as the original hidden state hidden_state = tf.reshape(batch_norm_hidden_state, shape=shape_list(hidden_state)) return hidden_state @@ -222,27 +230,28 @@ class TFLevitSubsample(tf.keras.layers.Layer): """ Layer to subsample the activatioin maps """ - def __init__(self, stride, resolution, **kwargs): - super().__init__() + + def __init__(self, stride, resolution, *args, **kwargs): + super().__init__(*args, **kwargs) self.stride = stride self.resolution = resolution - def call(self, hidden_state): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): batch_size = tf.shape(hidden_state)[0] channels = tf.shape(hidden_state)[2] - + reshaped_hidden_state = tf.reshape( hidden_state, shape=(batch_size, self.resolution, self.resolution, channels) ) strided_hidden_state = reshaped_hidden_state[:, :: self.stride, :: self.stride] hidden_state = tf.reshape(strided_hidden_state, shape=(batch_size, -1, channels)) - + return hidden_state class TFLevitAttention(tf.keras.layers.Layer): - def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution, **kwargs): - super().__init__(**kwargs) + def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution, *args, **kwargs): + super().__init__(*args, **kwargs) self.num_attention_heads = num_attention_heads self.scale = key_dim**-0.5 self.key_dim = key_dim @@ -250,9 +259,13 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, self.out_dim_keys_values = attention_ratio * key_dim * num_attention_heads + key_dim * num_attention_heads * 2 self.out_dim_projection = attention_ratio * key_dim * num_attention_heads - self.queries_keys_values = TFMLPLayerWithBN(hidden_sizes, self.out_dim_keys_values, name="queries_keys_values") + self.queries_keys_values = TFMLPLayerWithBN( + input_dim=hidden_sizes, output_dim=self.out_dim_keys_values, name="queries_keys_values" + ) self.activation = hard_swish - self.projection = TFMLPLayerWithBN(self.out_dim_projection, hidden_sizes, bn_weight_init=0, name="projection") + self.projection = TFMLPLayerWithBN( + input_dim=self.out_dim_projection, output_dim=hidden_sizes, bn_weight_init=0, name="projection" + ) # Build tuples of points in the entire resolution range of the pixel values points = list(itertools.product(range(resolution), range(resolution))) @@ -261,21 +274,21 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, # Initialize the attention offsets and indices attention_offsets, indices = {}, [] - # Iterate over the points generator and calculate the offset between the initial + # Iterate over the `points`` generator and calculate the offset between the initial # point (0, 0) and the rest of the points [(0, 1), (0, 2)...] 
- for p1 in points: # this iterates only once - for p2 in points: # iterate over all the points other than (0, 0) + for p1 in points: # this iterates only once, wehre p1 is (0, 0) + for p2 in points: # iterate over all the points other than (0, 0) offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) if offset not in attention_offsets: attention_offsets[offset] = len(attention_offsets) indices.append(attention_offsets[offset]) - + # Store the attention offsets, indices and attention bias cache self.attention_offsets = attention_offsets self.indices = indices self.attention_bias_cache = {} - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.attention_biases = self.add_weight( shape=(self.num_attention_heads, len(self.attention_offsets)), initializer="zeros", @@ -284,39 +297,38 @@ def build(self, input_shape): ) self.attention_bias_idxs = tf.Variable( initial_value=tf.reshape(self.indices, (self.len_points, self.len_points)), - trainable=False, # this is a registered buffer and not a parameter + trainable=False, # this is a registered buffer and not a parameter dtype=tf.int32, name="attention_bias_idxs", ) super().build(input_shape) - # # TODO @ariG23498 - # @torch.no_grad() - # def train(self, mode=True): - # super().train(mode) - # if mode and self.attention_bias_cache: - # self.attention_bias_cache = {} # clear ab cache - - def get_attention_biases(self, device, training=None): + def get_attention_biases(self, device, training: Optional[bool] = None): if training: - return self.attention_biases[:, self.attention_bias_idxs] + return tf.gather(self.attention_biases, self.attention_bias_idxs, axis=1) else: device_key = str(device) if device_key not in self.attention_bias_cache: - print("INFO biases cache", self.attention_biases.shape) - print("INFO biases index", self.attention_bias_idxs.shape) - self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + self.attention_bias_cache[device_key] = tf.gather( + self.attention_biases, self.attention_bias_idxs, axis=1 + ) return self.attention_bias_cache[device_key] - def call(self, hidden_state, training=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): + + # TODO: figure out the clearing cache mechanism + if training and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] queries_keys_values = self.queries_keys_values(hidden_state) - # Reshape queries_keys_values + # Reshape `queries_keys_values`. reshaped_queries_keys_values = tf.reshape( queries_keys_values, shape=(batch_size, seq_length, self.num_attention_heads, -1) ) + # Split the reshaped tensor into query, key, and value. 
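# End-to-end sketch of the relative-position bias machinery above for a tiny
# 2x2 resolution: unique offsets index a small table of per-head biases, and the
# index grid is a non-trainable variable (torch's registered buffer) expanded
# with `tf.gather`:
import itertools
import tensorflow as tf

resolution, num_heads = 2, 3
points = list(itertools.product(range(resolution), range(resolution)))
attention_offsets, indices = {}, []
for p1 in points:
    for p2 in points:
        offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
        if offset not in attention_offsets:
            attention_offsets[offset] = len(attention_offsets)
        indices.append(attention_offsets[offset])

attention_biases = tf.zeros((num_heads, len(attention_offsets)))   # learned weight in the model
attention_bias_idxs = tf.Variable(
    tf.reshape(indices, (len(points), len(points))), trainable=False, dtype=tf.int32
)
bias = tf.gather(attention_biases, attention_bias_idxs, axis=1)    # (heads, seq, seq)
assert bias.shape == (3, 4, 4)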
query, key, value = tf.split( value=reshaped_queries_keys_values, num_or_size_splits=[self.key_dim, self.key_dim, self.attention_ratio * self.key_dim], @@ -348,9 +360,10 @@ def __init__( stride, resolution_in, resolution_out, + *args, **kwargs, ): - super().__init__(**kwargs) + super().__init__(*args, **kwargs) self.num_attention_heads = num_attention_heads self.scale = key_dim**-0.5 self.key_dim = key_dim @@ -359,11 +372,13 @@ def __init__( self.out_dim_projection = attention_ratio * key_dim * num_attention_heads self.resolution_out = resolution_out # resolution_in is the intial resolution, resoloution_out is final resolution after downsampling - self.keys_values = TFMLPLayerWithBN(input_dim, self.out_dim_keys_values, name="keys_values") - self.queries_subsample = TFLevitSubsample(stride, resolution_in, name="queries_subsample") - self.queries = TFMLPLayerWithBN(input_dim, key_dim * num_attention_heads, name="queries") + self.keys_values = TFMLPLayerWithBN( + input_dim=input_dim, output_dim=self.out_dim_keys_values, name="keys_values" + ) + self.queries_subsample = TFLevitSubsample(stride=stride, resolution=resolution_in, name="queries_subsample") + self.queries = TFMLPLayerWithBN(input_dim=input_dim, output_dim=key_dim * num_attention_heads, name="queries") self.activation = hard_swish - self.projection = TFMLPLayerWithBN(self.out_dim_projection, output_dim, name="projection") + self.projection = TFMLPLayerWithBN(input_dim=self.out_dim_projection, output_dim=output_dim, name="projection") self.attention_bias_cache = {} @@ -382,7 +397,7 @@ def __init__( self.attention_offsets = attention_offsets self.indices = indices - def build(self, input_shape): + def build(self, input_shape: tf.TensorShape): self.attention_biases = self.add_weight( shape=(self.num_attention_heads, len(self.attention_offsets)), initializer="zeros", @@ -398,23 +413,23 @@ def build(self, input_shape): ) super().build(input_shape) - # # TODO @ariG23498 - # @torch.no_grad() - # def train(self, mode=True): - # super().train(mode) - # if mode and self.attention_bias_cache: - # self.attention_bias_cache = {} # clear ab cache - - def get_attention_biases(self, device, training=None): + def get_attention_biases(self, device, training: Optional[bool] = None): if training: - return self.attention_biases[:, self.attention_bias_idxs] + return tf.gather(self.attention_biases, self.attention_bias_idxs, axis=1) else: device_key = str(device) if device_key not in self.attention_bias_cache: - self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + self.attention_bias_cache[device_key] = tf.gather( + self.attention_biases, self.attention_bias_idxs, axis=1 + ) return self.attention_bias_cache[device_key] - def call(self, hidden_state, training=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): + + # TODO: figure out the clearing cache mechanism + if training and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] @@ -451,13 +466,13 @@ class TFLevitMLPLayer(tf.keras.layers.Layer): MLP Layer with `2X` expansion in contrast to ViT with `4X`. 
""" - def __init__(self, input_dim, hidden_dim, **kwargs): - super().__init__(**kwargs) - self.linear_up = TFMLPLayerWithBN(input_dim, hidden_dim, name="linear_up") + def __init__(self, input_dim, hidden_dim, *args, **kwargs): + super().__init__(*args, **kwargs) + self.linear_up = TFMLPLayerWithBN(input_dim=input_dim, output_dim=hidden_dim, name="linear_up") self.activation = hard_swish - self.linear_down = TFMLPLayerWithBN(hidden_dim, input_dim, name="linear_down") + self.linear_down = TFMLPLayerWithBN(input_dim=hidden_dim, output_dim=input_dim, name="linear_down") - def call(self, hidden_state, training=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state = self.linear_up(hidden_state, training=training) hidden_state = self.activation(hidden_state) hidden_state = self.linear_down(hidden_state, training=training) @@ -469,16 +484,18 @@ class TFLevitResidualLayer(tf.keras.layers.Layer): Residual Block for TFLeViT """ - def __init__(self, module, drop_rate, **kwargs): - super().__init__(**kwargs) + def __init__(self, module, drop_rate, *args, **kwargs): + super().__init__(*args, **kwargs) self.module = module self.drop_rate = drop_rate - def call(self, hidden_state, training=None): - if training and self.drop_rate > 0: + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): + if training and self.drop_rate > 0.0: rnd = tf.random.normal(shape=(tf.shape(hidden_state)[0], 1, 1), minval=0, maxval=1) rnd = tf.math.greater(rnd, self.drop_rate) rnd = tf.math.divide(rnd, (1 - self.drop_rate)) + # Detach the gradient from `rnd`. + tf.stop_gradient(rnd) hidden_state = hidden_state + self.module(hidden_state) * rnd return hidden_state else: @@ -503,30 +520,45 @@ def __init__( mlp_ratio, down_ops, resolution_in, + *args, **kwargs, ): - super().__init__(**kwargs) + super().__init__(*args, **kwargs) self.layers = [] self.config = config self.resolution_in = resolution_in - # resolution_in is the intial resolution, resolution_out is final resolution after downsampling - for index in range(depths): + # `resolution_in` is the intial resolution, `resolution_out` is final resolution after downsampling + index = 0 + for _ in range(depths): self.layers.append( TFLevitResidualLayer( - TFLevitAttention(hidden_sizes, key_dim, num_attention_heads, attention_ratio, resolution_in), - self.config.drop_path_rate, + module=TFLevitAttention( + hidden_sizes=hidden_sizes, + key_dim=key_dim, + num_attention_heads=num_attention_heads, + attention_ratio=attention_ratio, + resolution=resolution_in, + name="module", + ), + drop_rate=self.config.drop_path_rate, name=f"layers.{index}", ) ) + index += 1 # Increment the index by 1 if mlp_ratio > 0: hidden_dim = hidden_sizes * mlp_ratio self.layers.append( TFLevitResidualLayer( - TFLevitMLPLayer(hidden_sizes, hidden_dim), - self.config.drop_path_rate, + module=TFLevitMLPLayer( + input_dim=hidden_sizes, + hidden_dim=hidden_dim, + name="module", + ), + drop_rate=self.config.drop_path_rate, name=f"layers.{index}", ) ) + index += 1 # Increment the index by 1 if down_ops[0] == "Subsample": self.resolution_out = (self.resolution_in - 1) // down_ops[5] + 1 @@ -539,24 +571,28 @@ def __init__( stride=down_ops[5], resolution_in=resolution_in, resolution_out=self.resolution_out, - name=f"layers.{idx}", + name=f"layers.{index}", ) ) + index += 1 # Increment the index by 1 self.resolution_in = self.resolution_out if down_ops[4] > 0: hidden_dim = self.config.hidden_sizes[idx + 1] * down_ops[4] self.layers.append( TFLevitResidualLayer( 
- TFLevitMLPLayer(self.config.hidden_sizes[idx + 1], hidden_dim), - self.config.drop_path_rate, - name=f"layers.{idx}", - ) + module=TFLevitMLPLayer( + input_dim=self.config.hidden_sizes[idx + 1], hidden_dim=hidden_dim, name="module" + ), + drop_rate=self.config.drop_path_rate, + name=f"layers.{index}", + ), ) + index += 1 # Increment the index by 1 def get_resolution(self): return self.resolution_in - def call(self, hidden_state): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): for layer in self.layers: hidden_state = layer(hidden_state) return hidden_state @@ -567,38 +603,43 @@ class TFLevitEncoder(tf.keras.layers.Layer): LeViT Encoder consisting of multiple `TFLevitStage` stages. """ - def __init__(self, config, **kwargs): - super().__init__(**kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(*args, **kwargs) self.config = config resolution = self.config.image_size // self.config.patch_size self.stages = [] self.config.down_ops.append([""]) - # TODO ariG23498: add the index values to the layer names for stage_idx in range(len(config.depths)): stage = TFLevitStage( - config, - stage_idx, - config.hidden_sizes[stage_idx], - config.key_dim[stage_idx], - config.depths[stage_idx], - config.num_attention_heads[stage_idx], - config.attention_ratio[stage_idx], - config.mlp_ratio[stage_idx], - config.down_ops[stage_idx], - resolution, + config=config, + idx=stage_idx, + hidden_sizes=config.hidden_sizes[stage_idx], + key_dim=config.key_dim[stage_idx], + depths=config.depths[stage_idx], + num_attention_heads=config.num_attention_heads[stage_idx], + attention_ratio=config.attention_ratio[stage_idx], + mlp_ratio=config.mlp_ratio[stage_idx], + down_ops=config.down_ops[stage_idx], + resolution_in=resolution, name=f"stages.{stage_idx}", ) resolution = stage.get_resolution() self.stages.append(stage) - def call(self, hidden_state, output_hidden_states=False, return_dict=True, training=None): + def call( + self, + hidden_state: tf.Tensor, + output_hidden_states: bool = False, + return_dict: bool = True, + training: Optional[bool] = None, + ): all_hidden_states = () if output_hidden_states else None for stage in self.stages: if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) - hidden_state = stage(hidden_state) + hidden_state = stage(hidden_state, training=training) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) @@ -613,67 +654,19 @@ class TFLevitClassificationLayer(tf.keras.layers.Layer): LeViT Classification Layer """ - def __init__(self, input_dim, output_dim): - super().__init__() + def __init__(self, input_dim, output_dim, *args, **kwargs): + super().__init__(*args, **kwargs) # The epsilon and momentum used here are the defaults in torch batch norm layer. 
self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") - self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") + self.linear = tf.keras.layers.Dense(units=output_dim, name="linear") - def call(self, hidden_state, training=None): + def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state = self.batch_norm(hidden_state, training=training) logits = self.linear(hidden_state, training=training) return logits -@keras_serializable -class TFLevitMainLayer(tf.keras.layers.Layer): - config_class = LevitConfig - - def __init__(self, config, **kwargs): - super().__init__(**kwargs) - self.config = config - self.patch_embeddings = TFLevitPatchEmbeddings(config, name="patch_embeddings") - self.encoder = TFLevitEncoder(config, name="encoder") - - @unpack_inputs - def call( - self, - pixel_values: tf.Tensor = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - training: Optional[bool] = None, - ): - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - - # Apply patch embeddings to the pixel values - embeddings = self.patch_embeddings(pixel_values, training=training) - - # Apply encoder to the encoded pixel values - encoder_outputs = self.encoder( - embeddings, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - training=training, - ) - - # Obtain the `last_hidden_state` - last_hidden_state = encoder_outputs[0] # encoder_outputs.last_hidden_state - - # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) - pooled_output = tf.math.reduce_mean(last_hidden_state, axis=1) - - if not return_dict: - return (last_hidden_state, pooled_output) + encoder_outputs[1:] - - return TFBaseModelOutputWithPoolingAndNoAttention( - last_hidden_state=last_hidden_state, - pooler_output=pooled_output, - hidden_states=encoder_outputs.hidden_states, # only if the `output_hidden_states` is set to True - ) - - class TFLevitPreTrainedModel(TFPreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained @@ -717,6 +710,54 @@ def serving(self, inputs): return self.serving_output(output) +@keras_serializable +class TFLevitMainLayer(tf.keras.layers.Layer): + config_class = LevitConfig + + def __init__(self, config, *args, **kwargs): + super().__init__(*args, **kwargs) + self.config = config + self.patch_embeddings = TFLevitPatchEmbeddings(config=config, name="patch_embeddings") + self.encoder = TFLevitEncoder(config=config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: tf.Tensor = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBaseModelOutputWithPoolingAndNoAttention]: + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + # Apply patch embeddings to the pixel values + embeddings = self.patch_embeddings(pixel_values, training=training) + + # Apply encoder to the encoded pixel values + encoder_outputs = self.encoder( + embeddings, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + # Obtain the `last_hidden_state` + last_hidden_state = encoder_outputs[0] # encoder_outputs.last_hidden_state + + # global average pooling, (batch_size, seq_length, hidden_sizes) -> (batch_size, hidden_sizes) + pooled_output = 
tf.math.reduce_mean(last_hidden_state, axis=1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, # only if the `output_hidden_states` is set to True + ) + + LEVIT_START_DOCSTRING = r""" This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the @@ -780,8 +821,8 @@ def serving(self, inputs): LEVIT_START_DOCSTRING, ) class TFLevitModel(TFLevitPreTrainedModel): - def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) self.levit = TFLevitMainLayer(config=config, name="levit") @@ -832,15 +873,17 @@ def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> LEVIT_START_DOCSTRING, ) class TFLevitForImageClassification(TFLevitPreTrainedModel): - def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) self.config = config self.num_labels = config.num_labels - self.levit = TFLevitMainLayer(config, name="levit") + self.levit = TFLevitMainLayer(config=config, name="levit") # Classifier head self.classifier = ( - TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier") + TFLevitClassificationLayer( + input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier" + ) if config.num_labels > 0 else tf.identity ) @@ -888,8 +931,7 @@ def call( if self.config.problem_type is None: if self.num_labels == 1: self.config.problem_type = "regression" - # TODO @ariG23498: Check with the dtypes (long and int in torch) - elif self.num_labels > 1 and (labels.dtype == tf.float64 or labels.dtype == tf.int64): + elif self.num_labels > 1 and (labels.dtype == tf.int64 or labels.dtype == tf.int32): self.config.problem_type = "single_label_classification" else: self.config.problem_type = "multi_label_classification" @@ -927,20 +969,24 @@ def call( LEVIT_START_DOCSTRING, ) class TFLevitForImageClassificationWithTeacher(TFLevitPreTrainedModel): - def __init__(self, config, **kwargs): - super().__init__(config, **kwargs) + def __init__(self, config, *args, **kwargs): + super().__init__(config, *args, **kwargs) self.config = config self.num_labels = config.num_labels self.levit = TFLevitMainLayer(config, name="levit") # Classifier head self.classifier = ( - TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier") + TFLevitClassificationLayer( + input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier" + ) if config.num_labels > 0 else tf.identity ) self.classifier_distill = ( - TFLevitClassificationLayer(config.hidden_sizes[-1], config.num_labels, name="classifier_distill") + TFLevitClassificationLayer( + input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier_distill" + ) if config.num_labels > 0 else tf.identity ) From fdb690756232dc6c683fd21ac122b80509d674f9 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Fri, 25 Nov 2022 15:19:36 +0530 Subject: [PATCH 10/11] chore: making mdx changes and adding the tf model to various inits --- docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/levit.mdx | 3 +- src/transformers/__init__.py | 16 ++++++++++ .../models/auto/modeling_tf_auto.py | 1 + 
src/transformers/models/levit/__init__.py | 30 ++++++++++++++++++- .../models/levit/modeling_tf_levit.py | 2 +- 6 files changed, 50 insertions(+), 4 deletions(-) diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 790ce8f4d176..7b9820bb2ad9 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -277,7 +277,7 @@ Flax), PyTorch, and/or TensorFlow. | LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | | LayoutLMv3 | ✅ | ✅ | ✅ | ✅ | ❌ | | LED | ✅ | ✅ | ✅ | ✅ | ❌ | -| LeViT | ❌ | ❌ | ✅ | ❌ | ❌ | +| LeViT | ❌ | ❌ | ✅ | ✅ | ❌ | | LiLT | ❌ | ❌ | ✅ | ❌ | ❌ | | Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | | LongT5 | ❌ | ❌ | ✅ | ❌ | ✅ | diff --git a/docs/source/en/model_doc/levit.mdx b/docs/source/en/model_doc/levit.mdx index 1ebe93ff3ff7..017a97af7328 100644 --- a/docs/source/en/model_doc/levit.mdx +++ b/docs/source/en/model_doc/levit.mdx @@ -59,7 +59,8 @@ Tips: - You can check out demo notebooks regarding inference as well as fine-tuning on custom data [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) (you can just replace [`ViTFeatureExtractor`] by [`LevitFeatureExtractor`] and [`ViTForImageClassification`] by [`LevitForImageClassification`] or [`LevitForImageClassificationWithTeacher`]). -This model was contributed by [anugunj](https://huggingface.co/anugunj). The original code can be found [here](https://github.com/facebookresearch/LeViT). +This model was contributed by [anugunj](https://huggingface.co/anugunj). The TensorFlow version was contributed by +[Aritra Roy Gosthipaty](https://huggingface.co/ariG23498). The original code can be found [here](https://github.com/facebookresearch/LeViT). ## LevitConfig diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 9c5f33bea535..503673648e7f 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2696,6 +2696,15 @@ ] ) _import_structure["models.led"].extend(["TFLEDForConditionalGeneration", "TFLEDModel", "TFLEDPreTrainedModel"]) + _import_structure["models.levit"].extend( + [ + "TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFLevitForImageClassification", + "TFLevitForImageClassificationWithTeacher", + "TFLevitModel", + "TFLevitPreTrainedModel", + ] + ) _import_structure["models.longformer"].extend( [ "TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5472,6 +5481,13 @@ TFLayoutLMv3PreTrainedModel, ) from .models.led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel + from .models.levit import ( + TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFLevitForImageClassification, + TFLevitForImageClassificationWithTeacher, + TFLevitModel, + TFLevitPreTrainedModel, + ) from .models.longformer import ( TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, TFLongformerForMaskedLM, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 8bb7b5595f35..ad18273430a9 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -57,6 +57,7 @@ ("layoutlm", "TFLayoutLMModel"), ("layoutlmv3", "TFLayoutLMv3Model"), ("led", "TFLEDModel"), + ("levit", "TFLevitModel"), ("longformer", "TFLongformerModel"), ("lxmert", "TFLxmertModel"), ("marian", "TFMarianModel"), diff --git a/src/transformers/models/levit/__init__.py b/src/transformers/models/levit/__init__.py index f42fb02ad071..9cce4e7f3cf8 100644 --- a/src/transformers/models/levit/__init__.py +++ b/src/transformers/models/levit/__init__.py @@ -17,7 +17,7 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_tf_available, is_vision_available _import_structure = {"configuration_levit": ["LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LevitConfig", "LevitOnnxConfig"]} @@ -45,6 +45,20 @@ "LevitPreTrainedModel", ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_levit"] = [ + "TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFLevitForImageClassification", + "TFLevitForImageClassificationWithTeacher", + "TFLevitModel", + "TFLevitPreTrainedModel", + ] + if TYPE_CHECKING: from .configuration_levit import LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, LevitConfig, LevitOnnxConfig @@ -71,6 +85,20 @@ LevitModel, LevitPreTrainedModel, ) + + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_levit import ( + TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST, + TFLevitForImageClassification, + TFLevitForImageClassificationWithTeacher, + TFLevitModel, + TFLevitPreTrainedModel, + ) else: import sys diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 4c8af385b2e5..8fca39106989 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -49,7 +49,7 @@ _IMAGE_CLASS_CHECKPOINT = "facebook/levit-128S" _IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" -LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ +TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = [ "facebook/levit-128S", # See all LeViT models at https://huggingface.co/models?filter=levit ] From 95ffed10ebb6d79996e0475ec83b4bb40e8615f8 Mon Sep 17 00:00:00 2001 From: ariG23498 Date: Mon, 28 Nov 2022 17:17:48 +0530 Subject: [PATCH 11/11] chore: changing the defaults of BN layers and applying style fixup --- src/transformers/models/levit/__init__.py | 10 ++- .../models/levit/modeling_tf_levit.py | 65 +++++++++++-------- src/transformers/utils/dummy_tf_objects.py | 31 +++++++++ 3 files changed, 76 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/levit/__init__.py b/src/transformers/models/levit/__init__.py index 9cce4e7f3cf8..7a52103e6d4d 100644 --- a/src/transformers/models/levit/__init__.py +++ b/src/transformers/models/levit/__init__.py @@ -17,7 +17,13 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_tf_available, is_vision_available +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_tf_available, + is_torch_available, + is_vision_available, +) _import_structure = {"configuration_levit": ["LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LevitConfig", "LevitOnnxConfig"]} @@ -85,7 +91,7 @@ LevitModel, LevitPreTrainedModel, ) - + try: if not is_tf_available(): raise OptionalDependencyNotAvailable() diff --git a/src/transformers/models/levit/modeling_tf_levit.py b/src/transformers/models/levit/modeling_tf_levit.py index 8fca39106989..a66f2cd59436 100644 --- a/src/transformers/models/levit/modeling_tf_levit.py +++ b/src/transformers/models/levit/modeling_tf_levit.py @@ -113,13 +113,15 @@ def __init__( name="convolution", ) # The epsilon and momentum used here are the defaults in torch batch norm layer. 
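+ # NOTE: keras and torch define batch norm momentum in opposite senses: keras updates moving_mean = momentum * moving_mean + (1 - momentum) * batch_mean, while torch weights the new batch statistic by momentum, so torch's default of 0.1 corresponds to keras momentum=0.9 below.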
- self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") + self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.9, name="batch_norm") def call(self, embeddings: tf.Tensor, training: Optional[bool] = None): + # embeddings shape = (bsz, num_channels, height, width) embeddings = tf.transpose(embeddings, perm=(0, 2, 3, 1)) embeddings = self.padding(embeddings) embeddings = self.convolution(embeddings, training=training) embeddings = self.batch_norm(embeddings, training=training) + # embeddings shape = (bsz, height, width, num_channels) embeddings = tf.transpose(embeddings, perm=(0, 3, 1, 2)) return embeddings @@ -205,9 +207,17 @@ def call(self, pixel_values: tf.Tensor, training: Optional[bool] = None): class TFMLPLayerWithBN(tf.keras.layers.Layer): def __init__(self, input_dim, output_dim, bn_weight_init=1, *args, **kwargs): super().__init__(*args, **kwargs) - self.linear = tf.keras.layers.Dense(units=output_dim, use_bias=False, name="linear") + self.linear = tf.keras.layers.Dense( + units=output_dim, + use_bias=False, + name="linear", + ) # The epsilon matches the torch default; keras momentum=0.9 is equivalent to torch momentum=0.1. - self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") + self.batch_norm = tf.keras.layers.BatchNormalization( + epsilon=1e-05, + momentum=0.9, + name="batch_norm", + ) def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state = self.linear(hidden_state, training=training) @@ -218,6 +228,7 @@ def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): hidden_state_reshape_list = [ hidden_state_shape_list[0] * hidden_state_shape_list[1] ] + hidden_state_shape_list[2:] + flattened_hidden_state = tf.reshape(hidden_state, shape=hidden_state_reshape_list) batch_norm_hidden_state = self.batch_norm(flattened_hidden_state, training=training) @@ -228,7 +239,7 @@ def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): class TFLevitSubsample(tf.keras.layers.Layer): """ - Layer to subsample the activatioin maps + Layer to subsample the activation maps. """ def __init__(self, stride, resolution, *args, **kwargs): @@ -272,20 +283,18 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, self.len_points = len(points) # Initialize the attention offsets and indices - attention_offsets, indices = {}, [] + self.attention_offsets, self.indices = {}, [] # Iterate over every pair of points in the `points` list and record each distinct # absolute offset, assigning every offset a column index into the learned biases
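# For example, assuming resolution=2: points = [(0, 0), (0, 1), (1, 0), (1, 1)], the distinct offsets are (0, 0), (0, 1), (1, 0) and (1, 1), so four bias columns are learned and `indices` holds 4 * 4 = 16 lookups into them.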
for p1 in points: # iterate over every point in the feature map for p2 in points: # pair `p1` with every point, including itself offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) - if offset not in attention_offsets: - attention_offsets[offset] = len(attention_offsets) - indices.append(attention_offsets[offset]) + if offset not in self.attention_offsets: + self.attention_offsets[offset] = len(self.attention_offsets) + self.indices.append(self.attention_offsets[offset]) - # Store the attention offsets, indices and attention bias cache - self.attention_offsets = attention_offsets - self.indices = indices + # Store attention bias cache self.attention_bias_cache = {} def build(self, input_shape: tf.TensorShape): @@ -317,8 +326,8 @@ def get_attention_biases(self, device, training: Optional[bool] = None): def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): # TODO: figure out the clearing cache mechanism - if training and self.attention_bias_cache: - self.attention_bias_cache = {} # clear ab cache + # if training and self.attention_bias_cache: + # self.attention_bias_cache = {} # clear ab cache batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] @@ -427,8 +436,8 @@ def get_attention_biases(self, device, training: Optional[bool] = None): def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): # TODO: figure out the clearing cache mechanism - if training and self.attention_bias_cache: - self.attention_bias_cache = {} # clear ab cache + # if training and self.attention_bias_cache: + # self.attention_bias_cache = {} # clear ab cache batch_size = tf.shape(hidden_state)[0] seq_length = tf.shape(hidden_state)[1] @@ -594,7 +603,7 @@ def get_resolution(self): def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): for layer in self.layers: - hidden_state = layer(hidden_state) + hidden_state = layer(hidden_state, training=training) return hidden_state @@ -658,7 +667,7 @@ def __init__(self, input_dim, output_dim, *args, **kwargs): super().__init__(*args, **kwargs) # The epsilon matches the torch default; keras momentum=0.9 is equivalent to torch momentum=0.1. - self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.1, name="batch_norm") + self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-05, momentum=0.9, name="batch_norm") self.linear = tf.keras.layers.Dense(units=output_dim, name="linear") def call(self, hidden_state: tf.Tensor, training: Optional[bool] = None): @@ -823,7 +832,6 @@ def call( class TFLevitModel(TFLevitPreTrainedModel): def __init__(self, config, *args, **kwargs): super().__init__(config, *args, **kwargs) - self.levit = TFLevitMainLayer(config=config, name="levit") @unpack_inputs @@ -849,10 +857,8 @@ def call( return_dict=return_dict, training=training, ) - return outputs - # TODO @ariG23498: Check the output type for serving.
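+ # NOTE: LeViT emits no attention maps (its outputs are the `...NoAttention` types), so the `attentions` branch below is expected to resolve to None in practice.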
def serving_output(self, output: TFBaseModelOutputWithPoolingAndNoAttention) -> TFBaseModelOutputWithPooling: hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None @@ -885,7 +891,7 @@ def __init__(self, config, *args, **kwargs): input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier" ) if config.num_labels > 0 - else tf.identity + else tf.keras.layers.Activation("linear", name="classifier") ) @unpack_inputs @@ -978,17 +984,21 @@ def __init__(self, config, *args, **kwargs): # Classifier head self.classifier = ( TFLevitClassificationLayer( - input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier" + input_dim=config.hidden_sizes[-1], + output_dim=config.num_labels, + name="classifier", ) if config.num_labels > 0 - else tf.identity + else tf.keras.layers.Activation("linear", name="classifier") ) self.classifier_distill = ( TFLevitClassificationLayer( - input_dim=config.hidden_sizes[-1], output_dim=config.num_labels, name="classifier_distill" + input_dim=config.hidden_sizes[-1], + output_dim=config.num_labels, + name="classifier_distill", ) if config.num_labels > 0 - else tf.identity + else tf.keras.layers.Activation("linear", name="classifier_distill") ) @unpack_inputs @@ -1020,9 +1030,8 @@ def call( sequence_output = tf.math.reduce_mean(sequence_output, axis=1) # Apply the classifier heads and obtain the `cls_logits` and `distill_logits` - cls_logits, distill_logits = self.classifier(sequence_output, training=training), self.classifier_distill( - sequence_output, training=training - ) + cls_logits = self.classifier(sequence_output, training=training) + distill_logits = self.classifier_distill(sequence_output, training=training) # According to the paper, the cls and distill logits are averaged logits = (cls_logits + distill_logits) / 2 diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index d16a75591d62..a72fa1db2461 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -1499,6 +1499,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +TF_LEVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFLevitForImageClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLevitForImageClassificationWithTeacher(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLevitModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFLevitPreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
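As a quick end-to-end sanity check of the ported model (a minimal sketch, not part of the patch: it assumes a source install of this branch, reuses the existing `LevitFeatureExtractor` from the PyTorch side, and converts the torch checkpoint with `from_pt=True`):

    from PIL import Image
    import requests

    from transformers import LevitFeatureExtractor, TFLevitModel

    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)

    feature_extractor = LevitFeatureExtractor.from_pretrained("facebook/levit-128S")
    model = TFLevitModel.from_pretrained("facebook/levit-128S", from_pt=True)

    # The feature extractor returns channels-first pixel values, matching the
    # data_format used by the convolutional embeddings.
    inputs = feature_extractor(images=image, return_tensors="tf")
    outputs = model(**inputs)

    # last_hidden_state has shape (batch_size, seq_length, hidden_size)
    print(outputs.last_hidden_state.shape)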