From b5918877776b6c9b114a9c79be8f2a0227ac24be Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 28 Jun 2022 10:52:27 +0530 Subject: [PATCH 01/29] add: segformer utils and img. classification. --- .../models/segformer/modeling_tf_segformer.py | 678 ++++++++++++++++++ 1 file changed, 678 insertions(+) create mode 100644 src/transformers/models/segformer/modeling_tf_segformer.py diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py new file mode 100644 index 000000000000..ea30c322ef1f --- /dev/null +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -0,0 +1,678 @@ +# coding=utf-8 +# Copyright 2022 NVIDIA The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TensorFlow SegFormer model.""" + +import math +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import TFBaseModelOutput, TFSequenceClassifierOutput +from ...modeling_tf_utils import TFPreTrainedModel, keras_serializable, unpack_inputs +from ...tf_utils import shape_list, stable_softmax +from ...utils import logging +from .configuration_segformer import SegformerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "SegformerConfig" +_FEAT_EXTRACTOR_FOR_DOC = "SegformerFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "nvidia/mit-b0" +_EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + +TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "nvidia/segformer-b0-finetuned-ade-512-512", + # See all SegFormer models at https://huggingface.co/models?filter=segformer +] + + +@dataclass +class TFSegFormerImageClassifierOutput(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called + feature maps) of the model at the output of each stage. 
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->Segformer +class TFSegformerDropPath(tf.keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_path, **kwargs): + super().__init__(**kwargs) + self.drop_path = drop_path + + def call(self, x, training=None): + if training: + keep_prob = 1 - self.drop_path + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer): + """Construct the overlapping patch embeddings.""" + + def __init__(self, patch_size, stride, hidden_size, **kwargs): + super().__init__(**kwargs) + self.padding = tf.keras.layers.ZeroPadding2D(padding=patch_size // 2) + self.proj = tf.keras.layers.Conv2d( + filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj" + ) + + self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm") + + def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]: + embeddings = self.proj(self.padding(pixel_values)) + height = shape_list(embeddings)[1] + width = shape_list(embeddings)[2] + hidden_dim = shape_list(embeddings)[3] + # (batch_size, height, width, num_channels) -> (batch_size, height*width, num_channels) + # this can be fed to a Transformer layer + embeddings = tf.reshape(embeddings, (-1, height * width, hidden_dim)) + embeddings = self.layer_norm(embeddings) + return embeddings, height, width + + +class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer): + """SegFormer's efficient self-attention mechanism. 
Employs the sequence reduction process introduced in the [PvT + paper](https://arxiv.org/abs/2102.12122).""" + + def __init__(self, config, hidden_size: int, num_attention_heads: int, sequence_reduction_ratio: int, **kwargs): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})" + ) + + self.attention_head_size = self.hidden_size // self.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = tf.keras.layers.Dense(self.all_head_size, name="query") + self.key = tf.keras.layers.Dense(self.all_head_size, name="key") + self.value = tf.keras.layers.Dense(self.all_head_size, name="value") + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + self.sr_ratio = sequence_reduction_ratio + if sequence_reduction_ratio > 1: + self.sr = tf.keras.layers.Conv2D( + filters=hidden_size, kernel_size=sequence_reduction_ratio, strides=sequence_reduction_ratio, name="sr" + ) + self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm") + + def transpose_for_scores(self, hidden_states: tf.Tensor) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + batch_size = shape_list(hidden_states)[0] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + height: int, + width: int, + output_attentions: bool = False, + training: bool = False, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + if self.sr_ratio > 1: + batch_size = shape_list(hidden_states)[0] + seq_len = shape_list(hidden_states)[1] + num_channels = shape_list(hidden_states)[2] + # Reshape to (batch_size, height, width, num_channels) + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) + # Apply sequence reduction + hidden_states = self.sr(hidden_states) + # Reshape back to (batch_size, seq_len, num_channels) + hidden_states = tf.reshape(hidden_states, (batch_size, -1, num_channels)) + hidden_states = self.layer_norm(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
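+ # Keras Dropout rescales the kept probabilities by 1 / (1 - rate), so the expected attention weights are unchanged.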
+ attention_probs = self.dropout(attention_probs, training=training) + + context_layer = tf.matmul(attention_probs, value_layer) + + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size)) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + +class TFSegformerSelfOutput(tf.keras.layers.Layer): + def __init__(self, config, hidden_size: int, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(hidden_size, name="dense") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class TFSegformerAttention(tf.keras.layers.Layer): + def __init__(self, config, hidden_size: int, num_attention_heads: int, sequence_reduction_ratio: int, **kwargs): + super().__init__(**kwargs) + self.self = TFSegformerEfficientSelfAttention( + config=config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + name="self", + ) + self.output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output") + + def call( + self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False + ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: + self_outputs = self.self(hidden_states, height, width, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class TFSegformerDWConv(tf.keras.layers.Layer): + def __init__(self, dim: int = 768, **kwargs): + super().__init__(**kwargs) + self.dwconv = tf.keras.layers.Conv2d( + filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" + ) + + def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: + batch_size = shape_list(hidden_states)[0] + num_channels = shape_list(hidden_states)[-1] + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) + hidden_states = self.dwconv(hidden_states) + + new_height = shape_list(hidden_states)[1] + new_width = shape_list(hidden_states)[2] + hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, -1)) + + return hidden_states + + +class TFSegformerMixFFN(tf.keras.layers.Layer): + def __init__(self, config, in_features: int, hidden_features: int = None, out_features: int = None, **kwargs): + super().__init__(**kwargs) + out_features = out_features or in_features + self.dense1 = tf.keras.layers.Dense(hidden_features, name="dense1") + self.dwconv = TFSegformerDWConv(hidden_features, name="dwconv") + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + self.dense2 = tf.keras.layers.Dense(out_features, name="dense2") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: + hidden_states = self.dense1(hidden_states) + hidden_states = self.dwconv(hidden_states, height, width) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense2(hidden_states) + 
hidden_states = self.dropout(hidden_states) + return hidden_states + + +class TFSegformerLayer(tf.keras.layers.Layer): + """This corresponds to the Block class in the original implementation.""" + + def __init__( + self, + config, + hidden_size: int, + num_attention_heads: int, + drop_path: int, + sequence_reduction_ratio: int, + mlp_ratio: int, + **kwargs + ): + super().__init__(**kwargs) + self.layer_norm_1 = tf.keras.layers.LayerNormalization(name="layer_norm_1") + self.attention = TFSegformerAttention( + config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + name="attention", + ) + self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else tf.identity + self.layer_norm_2 = tf.keras.layers.LayerNormalization(name="layer_norm_2") + mlp_hidden_size = int(hidden_size * mlp_ratio) + self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp") + + def call( + self, + hidden_states: tf.tensor, + height: int, + width: int, + output_attentions: bool = False, + training: bool = False, + ): + self_attention_outputs = self.attention( + self.layer_norm_1(hidden_states), # in Segformer, layernorm is applied before self-attention + height, + width, + output_attentions=output_attentions, + training=training, + ) + + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection (with stochastic depth) + attention_output = self.drop_path(attention_output, training=training) + hidden_states = attention_output + hidden_states + + mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width) + + # second residual connection (with stochastic depth) + mlp_output = self.drop_path(mlp_output, training=training) + layer_output = mlp_output + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class TFSegformerEncoder(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + + # stochastic depth decay rule + drop_path_decays = tf.linspace(0.0, config.drop_path_rate, sum(config.depths)) + drop_path_decays = tf.split(drop_path_decays, config.depths) + drop_path_decays = [x.numpy().tolist() for x in drop_path_decays] + + # patch embeddings + embeddings = [] + for i in range(config.num_encoder_blocks): + embeddings.append( + TFSegformerOverlapPatchEmbeddings( + patch_size=config.patch_sizes[i], + stride=config.strides[i], + num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1], + hidden_size=config.hidden_sizes[i], + name=f"embeddings.{i}", + ) + ) + self.embeddings = embeddings + # self.patch_embeddings = nn.ModuleList(embeddings) + + # Transformer blocks + blocks = [] + cur = 0 + for i in range(config.num_encoder_blocks): + # each block consists of layers + layers = [] + if i != 0: + cur += config.depths[i - 1] + for j in range(config.depths[i]): + layers.append( + TFSegformerLayer( + config, + hidden_size=config.hidden_sizes[i], + num_attention_heads=config.num_attention_heads[i], + drop_path=drop_path_decays[cur + j], + sequence_reduction_ratio=config.sr_ratios[i], + mlp_ratio=config.mlp_ratios[i], + name=f"block.{i}.blocks.{j}", + ) + ) + blocks.append(layers) + + # self.block = nn.ModuleList(blocks) + self.block = blocks + + # Layer norms + self.layer_norms = [ + tf.keras.layers.LayerNormalization(name=f"layer_norm.{i}") for i in 
range(config.num_encoder_blocks) + ] + + def call( + self, + pixel_values: tf.Tensor, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + training: bool = False, + ) -> Union[Tuple, TFBaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + batch_size = shape_list(pixel_values)[0] + + hidden_states = pixel_values + for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)): + embedding_layers, block_layers, norm_layers = x # all of these are lists + # first, obtain patch embeddings + for embedding_layer in embedding_layers: + hidden_states, height, width = embedding_layer(hidden_states) + + # second, send embeddings through blocks + # (each block consists of multiple layers i.e., list of layers) + for i, blk in enumerate(block_layers): + for module in blk: + layer_outputs = module( + hidden_states, + height, + width, + output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + # third, apply layer norm + for norm_layer in norm_layers: + hidden_states = norm_layer(hidden_states) + + # fourth, optionally reshape back to (batch_size, num_channels, height, width) + if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage): + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, -1)) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@keras_serializable +class TFSegformerMainLayer(tf.keras.layers.Layer): + config_class = SegformerConfig + + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + # hierarchical Transformer encoder + self.encoder = TFSegformerEncoder(config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: tf.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[Tuple, TFBaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_outputs = self.encoder( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return TFBaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class TFSegformerPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
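+ `TFSegformerModel` and the task-specific classes in this file inherit from it.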
+ """ + + config_class = SegformerConfig + base_model_prefix = "segformer" + main_input_name = "pixel_values" + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + VISION_DUMMY_INPUTS = tf.random.uniform(shape=(3, 224, 224), dtype=tf.float32) + return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} + + @tf.function( + input_signature=[ + { + "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"), + } + ] + ) + def serving(self, inputs): + """ + Method used for serving the model. + + Args: + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + return self.call(inputs) + + +SEGFORMER_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`SegformerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +SEGFORMER_INPUTS_DOCSTRING = r""" + + Args: + pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See + [`AutoFeatureExtractor.__call__`] for details. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. + + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). 
+""" + + +@add_start_docstrings( + "The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.", + SEGFORMER_START_DOCSTRING, +) +class TFSegformerModel(TFSegformerPreTrainedModel): + def __init__(self, config: SegformerConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.config = config + + # hierarchical Transformer encoder + self.segformer = TFSegformerMainLayer(config, name="segformer") + + @unpack_inputs + @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) + @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: tf.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[Tuple, TFBaseModelOutput]: + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + +@add_start_docstrings( + """ + SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden + states) e.g. for ImageNet. + """, + SEGFORMER_START_DOCSTRING, +) +class TFSegformerForImageClassification(TFSegformerPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.segformer = TFSegformerMainLayer(config, name="segformer") + + # Classifier head + self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TFSequenceClassifierOutput]: + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # convert last hidden states to (batch_size, height*width, hidden_size) + batch_size = shape_list(sequence_output)[0] + sequence_output = tf.reshape(sequence_output, (batch_size, -1, self.config.hidden_sizes[-1])) + + # global average pooling + sequence_output = tf.reduce_mean(sequence_output, axis=1) + + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) From a6f87814d2880c0d75a6ee68174ee63f39703154 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 28 Jun 2022 11:57:41 +0530 Subject: [PATCH 02/29] add: segmentation layer. 
--- .../models/segformer/modeling_tf_segformer.py | 199 ++++++++++++++++-- 1 file changed, 179 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index ea30c322ef1f..d1edb18f6649 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -27,7 +27,7 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) -from ...modeling_tf_outputs import TFBaseModelOutput, TFSequenceClassifierOutput +from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput from ...modeling_tf_utils import TFPreTrainedModel, keras_serializable, unpack_inputs from ...tf_utils import shape_list, stable_softmax from ...utils import logging @@ -371,7 +371,7 @@ def __init__(self, config, **kwargs): stride=config.strides[i], num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1], hidden_size=config.hidden_sizes[i], - name=f"embeddings.{i}", + name=f"patch_embeddings.{i}", ) ) self.embeddings = embeddings @@ -394,7 +394,7 @@ def __init__(self, config, **kwargs): drop_path=drop_path_decays[cur + j], sequence_reduction_ratio=config.sr_ratios[i], mlp_ratio=config.mlp_ratios[i], - name=f"block.{i}.blocks.{j}", + name=f"block.{i}.{j}", ) ) blocks.append(layers) @@ -422,31 +422,28 @@ def call( hidden_states = pixel_values for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)): - embedding_layers, block_layers, norm_layers = x # all of these are lists + embedding_layer, block_layer, norm_layer = x # all of these are lists # first, obtain patch embeddings - for embedding_layer in embedding_layers: - hidden_states, height, width = embedding_layer(hidden_states) + hidden_states, height, width = embedding_layer(hidden_states) # second, send embeddings through blocks # (each block consists of multiple layers i.e., list of layers) - for i, blk in enumerate(block_layers): - for module in blk: - layer_outputs = module( - hidden_states, - height, - width, - output_attentions, - training=training, - ) - hidden_states = layer_outputs[0] + for i, blk in enumerate(block_layer): + layer_outputs = blk( + hidden_states, + height, + width, + output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] if output_attentions: all_self_attentions = all_self_attentions + (layer_outputs[1],) # third, apply layer norm - for norm_layer in norm_layers: - hidden_states = norm_layer(hidden_states) + hidden_states = norm_layer(hidden_states) - # fourth, optionally reshape back to (batch_size, num_channels, height, width) + # fourth, optionally reshape back to (batch_size, height, width, num_channels) if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage): hidden_states = tf.reshape(hidden_states, (batch_size, height, width, -1)) @@ -488,6 +485,11 @@ def call( ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. 
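+ # The transpose happens once here so that every inner layer can assume channels-last inputs.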
+ # shape = (batch_size, in_height, in_width, in_channels=num_channels) + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + encoder_outputs = self.encoder( pixel_values, output_attentions=output_attentions, @@ -641,7 +643,7 @@ def __init__(self, config, *inputs, **kwargs): @unpack_inputs @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) - def forward( + def call( self, pixel_values: Optional[tf.Tensor] = None, labels: Optional[tf.Tensor] = None, @@ -676,3 +678,160 @@ def forward( return TFSequenceClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions ) + + +class TFSegformerMLP(tf.keras.layers.Layer): + """ + Linear Embedding. + """ + + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(**kwargs) + self.proj = tf.keras.layers.Dense(config.decoder_hidden_size, name="proj") + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_dim = shape_list(hidden_states)[-1] + height = shape_list(hidden_states)[1] + width = shape_list(hidden_states)[2] + hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim)) + hidden_states = self.proj(hidden_states) + return hidden_states + + +class TFSegformerDecodeHead(TFSegformerPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = TFSegformerMLP(config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}") + mlps.append(mlp) + self.mlps = mlps + + # the following 3 layers implement the ConvModule of the original implementation + self.linear_fuse = tf.keras.layers.Conv2d( + filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse" + ) + self.batch_norm = tf.keras.layers.BatchNormalization(name="batch_norm") + self.activation = tf.keras.layers.Activation("relu") + + self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) + self.classifier = tf.keras.layers.Conv2d(filters=config.num_labels, kernel_size=1, name="classifier") + + self.config = config + + def call(self, encoder_hidden_states: tf.Tensor): + batch_size = encoder_hidden_states[-1].shape[0] + + all_hidden_states = () + for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps): + if self.config.reshape_last_stage is False and len(shape_list(encoder_hidden_state)) == 3: + height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[-1], tf.float32)) + height = width = tf.cast(height, tf.int32) + encoder_hidden_state = tf.reshape(encoder_hidden_state, (batch_size, height, width, -1)) + + # unify channel dimension + height = shape_list(encoder_hidden_state)[1] + width = shape_list(encoder_hidden_state)[2] + encoder_hidden_state = mlp(encoder_hidden_state) + encoder_hidden_state = tf.reshape(encoder_hidden_state, (batch_size, height, width, -1)) + # upsample + encoder_hidden_state = tf.image.resize( + encoder_hidden_state, size=shape_list(encoder_hidden_states[0])[1:-1], method="bilinear" + ) + all_hidden_states += (encoder_hidden_state,) + + hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=1)) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = 
self.dropout(hidden_states) + + # logits of shape (batch_size, height/4, width/4, num_labels) + logits = self.classifier(hidden_states) + + return logits + + +@add_start_docstrings( + """SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""", + SEGFORMER_START_DOCSTRING, +) +class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + self.segformer = TFSegformerMainLayer(config, name="segformer") + self.decode_head = TFSegformerDecodeHead(config, name="decode_head") + + def compute_loss(self, logits, labels): + # upsample logits to the images' original size + if len(shape_list(labels)) > 3: + label_interp_shape = shape_list(labels)[1:-1] + else: + label_interp_shape = shape_list(labels)[-2:] + + upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear") + # compute weighted loss + loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") + + # Taken from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics. + # Utility to mask the index to ignore during computing the loss. + def masked_loss(real, pred): + mask = tf.math.logical_not(tf.math.equal(real, self.config.semantic_loss_ignore_index)) + loss_ = loss_fct(real, pred) + mask = tf.cast(mask, dtype=loss_.dtype) + loss_ *= mask + + return tf.reduce_sum(loss_) / tf.reduce_sum(mask) + + return masked_loss(labels, upsampled_logits) + + @unpack_inputs + @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + pixel_values: tf.Tensor, + labels: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TFSemanticSegmenterOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + return_dict=return_dict, + ) + + encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + + logits = self.decode_head(encoder_hidden_states) + + loss = None + if labels is not None: + if self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + else: + loss = self.compute_loss(logits, labels) + + if not return_dict: + if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + # make logits of shape (batch_size, num_labels, height/4, width/4) to + # keep them consistent across APIs + logits = tf.transpose(logits, perm=[0, 3, 1, 2]) + return TFSemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) From 4ebb62a6815cb20d87b8fd626a70cb369ec1e97c Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 28 Jun 2022 15:10:26 +0530 Subject: [PATCH 03/29] feat: working implementation of segformer. 
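This patch fixes what kept the port from loading and running:

* `tf.keras.layers.Conv2d` -> `tf.keras.layers.Conv2D` (the lowercase name
  does not exist) and `get_tf_activation(...)` is called, not indexed.
* LayerNormalization/BatchNormalization now use the PyTorch epsilon (1e-05).
* The self-attention output projection is renamed to `sa_output`, the
  decode-head features are concatenated along the channel axis, and the
  unused TFSegFormerImageClassifierOutput dataclass is dropped in favor of
  TFSequenceClassifierOutput.

A minimal sanity check, assuming the PyTorch weights convert cleanly with
`from_pt=True`:

    import tensorflow as tf
    from transformers import TFSegformerForSemanticSegmentation

    model = TFSegformerForSemanticSegmentation.from_pretrained(
        "nvidia/segformer-b0-finetuned-ade-512-512", from_pt=True
    )
    pixel_values = tf.random.uniform((1, 3, 512, 512))  # NCHW, as in PyTorch
    outputs = model(pixel_values, training=False)
    print(outputs.logits.shape)  # (1, num_labels, 128, 128)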
--- .../models/segformer/modeling_tf_segformer.py | 135 ++++++++++-------- 1 file changed, 72 insertions(+), 63 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index d1edb18f6649..fd2124a042d4 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -15,14 +15,13 @@ """ TensorFlow SegFormer model.""" import math -from dataclasses import dataclass from typing import Dict, Optional, Tuple, Union import tensorflow as tf from ...activations_tf import get_tf_activation from ...file_utils import ( - ModelOutput, + add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings, @@ -54,33 +53,6 @@ ] -@dataclass -class TFSegFormerImageClassifierOutput(ModelOutput): - """ - Base class for outputs of image classification models. - - Args: - loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Classification (or regression if config.num_labels==1) loss. - logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for - the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called - feature maps) of the model at the output of each stage. - attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - loss: Optional[tf.Tensor] = None - logits: tf.Tensor = None - hidden_states: Optional[Tuple[tf.Tensor]] = None - attentions: Optional[Tuple[tf.Tensor]] = None - - # Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->Segformer class TFSegformerDropPath(tf.keras.layers.Layer): """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
@@ -108,11 +80,11 @@ class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer): def __init__(self, patch_size, stride, hidden_size, **kwargs): super().__init__(**kwargs) self.padding = tf.keras.layers.ZeroPadding2D(padding=patch_size // 2) - self.proj = tf.keras.layers.Conv2d( + self.proj = tf.keras.layers.Conv2D( filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj" ) - self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm") + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]: embeddings = self.proj(self.padding(pixel_values)) @@ -156,11 +128,11 @@ def __init__(self, config, hidden_size: int, num_attention_heads: int, sequence_ self.sr = tf.keras.layers.Conv2D( filters=hidden_size, kernel_size=sequence_reduction_ratio, strides=sequence_reduction_ratio, name="sr" ) - self.layer_norm = tf.keras.layers.LayerNormalization(name="layer_norm") + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") - def transpose_for_scores(self, hidden_states: tf.Tensor) -> tf.Tensor: + def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor: # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] - batch_size = shape_list(hidden_states)[0] + batch_size = shape_list(tensor)[0] tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] @@ -174,12 +146,13 @@ def call( output_attentions: bool = False, training: bool = False, ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: + batch_size = shape_list(hidden_states)[0] + seq_len = shape_list(hidden_states)[1] + num_channels = shape_list(hidden_states)[2] + query_layer = self.transpose_for_scores(self.query(hidden_states)) if self.sr_ratio > 1: - batch_size = shape_list(hidden_states)[0] - seq_len = shape_list(hidden_states)[1] - num_channels = shape_list(hidden_states)[2] # Reshape to (batch_size, height, width, num_channels) hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) # Apply sequence reduction @@ -236,14 +209,14 @@ def __init__(self, config, hidden_size: int, num_attention_heads: int, sequence_ sequence_reduction_ratio=sequence_reduction_ratio, name="self", ) - self.output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output") + self.sa_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output") def call( self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: self_outputs = self.self(hidden_states, height, width, output_attentions) - attention_output = self.output(self_outputs[0], hidden_states) + attention_output = self.sa_output(self_outputs[0]) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -251,7 +224,7 @@ def call( class TFSegformerDWConv(tf.keras.layers.Layer): def __init__(self, dim: int = 768, **kwargs): super().__init__(**kwargs) - self.dwconv = tf.keras.layers.Conv2d( + self.dwconv = tf.keras.layers.Conv2D( filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" ) @@ -275,7 +248,7 @@ def __init__(self, config, in_features: int, 
hidden_features: int = None, out_fe self.dense1 = tf.keras.layers.Dense(hidden_features, name="dense1") self.dwconv = TFSegformerDWConv(hidden_features, name="dwconv") if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation[config.hidden_act] + self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: self.intermediate_act_fn = config.hidden_act self.dense2 = tf.keras.layers.Dense(out_features, name="dense2") @@ -299,13 +272,13 @@ def __init__( config, hidden_size: int, num_attention_heads: int, - drop_path: int, + drop_path: float, sequence_reduction_ratio: int, mlp_ratio: int, **kwargs ): super().__init__(**kwargs) - self.layer_norm_1 = tf.keras.layers.LayerNormalization(name="layer_norm_1") + self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1") self.attention = TFSegformerAttention( config, hidden_size=hidden_size, @@ -313,19 +286,19 @@ def __init__( sequence_reduction_ratio=sequence_reduction_ratio, name="attention", ) - self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else tf.identity - self.layer_norm_2 = tf.keras.layers.LayerNormalization(name="layer_norm_2") + self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else tf.keras.layers.Activation("linear") + self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2") mlp_hidden_size = int(hidden_size * mlp_ratio) self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp") def call( self, - hidden_states: tf.tensor, + hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False, training: bool = False, - ): + ) -> Tuple: self_attention_outputs = self.attention( self.layer_norm_1(hidden_states), # in Segformer, layernorm is applied before self-attention height, @@ -358,9 +331,7 @@ def __init__(self, config, **kwargs): self.config = config # stochastic depth decay rule - drop_path_decays = tf.linspace(0.0, config.drop_path_rate, sum(config.depths)) - drop_path_decays = tf.split(drop_path_decays, config.depths) - drop_path_decays = [x.numpy().tolist() for x in drop_path_decays] + drop_path_decays = [x.numpy() for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] # patch embeddings embeddings = [] @@ -369,7 +340,6 @@ def __init__(self, config, **kwargs): TFSegformerOverlapPatchEmbeddings( patch_size=config.patch_sizes[i], stride=config.strides[i], - num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1], hidden_size=config.hidden_sizes[i], name=f"patch_embeddings.{i}", ) @@ -404,7 +374,8 @@ def __init__(self, config, **kwargs): # Layer norms self.layer_norms = [ - tf.keras.layers.LayerNormalization(name=f"layer_norm.{i}") for i in range(config.num_encoder_blocks) + tf.keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}") + for i in range(config.num_encoder_blocks) ] def call( @@ -527,7 +498,8 @@ def dummy_inputs(self) -> Dict[str, tf.Tensor]: Returns: `Dict[str, tf.Tensor]`: The dummy inputs. 
""" - VISION_DUMMY_INPUTS = tf.random.uniform(shape=(3, 224, 224), dtype=tf.float32) + # (todo: sayakpaul): change the batch size to 3 + VISION_DUMMY_INPUTS = tf.random.uniform(shape=(1, self.config.num_channels, 512, 512), dtype=tf.float32) return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} @tf.function( @@ -604,7 +576,14 @@ def __init__(self, config: SegformerConfig, *inputs, **kwargs): @unpack_inputs @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) - @replace_return_docstrings(output_type=TFBaseModelOutput, config_class=_CONFIG_FOR_DOC) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) def call( self, pixel_values: tf.Tensor, @@ -642,7 +621,13 @@ def __init__(self, config, *inputs, **kwargs): @unpack_inputs @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=TFSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) def call( self, pixel_values: Optional[tf.Tensor] = None, @@ -709,19 +694,19 @@ def __init__(self, config, **kwargs): self.mlps = mlps # the following 3 layers implement the ConvModule of the original implementation - self.linear_fuse = tf.keras.layers.Conv2d( + self.linear_fuse = tf.keras.layers.Conv2D( filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse" ) - self.batch_norm = tf.keras.layers.BatchNormalization(name="batch_norm") + self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="batch_norm") self.activation = tf.keras.layers.Activation("relu") self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) - self.classifier = tf.keras.layers.Conv2d(filters=config.num_labels, kernel_size=1, name="classifier") + self.classifier = tf.keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier") self.config = config - def call(self, encoder_hidden_states: tf.Tensor): - batch_size = encoder_hidden_states[-1].shape[0] + def call(self, encoder_hidden_states): + batch_size = shape_list(encoder_hidden_states[-1])[0] all_hidden_states = () for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps): @@ -740,8 +725,7 @@ def call(self, encoder_hidden_states: tf.Tensor): encoder_hidden_state, size=shape_list(encoder_hidden_states[0])[1:-1], method="bilinear" ) all_hidden_states += (encoder_hidden_state,) - - hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=1)) + hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1)) hidden_states = self.batch_norm(hidden_states) hidden_states = self.activation(hidden_states) hidden_states = self.dropout(hidden_states) @@ -796,6 +780,31 @@ def call( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, TFSemanticSegmenterOutput]: + r""" + labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. 
If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import SegformerFeatureExtractor, TFSegformerForSemanticSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") + >>> model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") + + >>> inputs = feature_extractor(images=image, return_tensors="tf") + >>> outputs = model(**inputs, training=False) + >>> # logits are of shape (batch_size, num_labels, height, width) + >>> logits = outputs.logits + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -826,7 +835,7 @@ def call( output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output - # make logits of shape (batch_size, num_labels, height/4, width/4) to + # make logits of shape (batch_size, num_labels, height, width) to # keep them consistent across APIs logits = tf.transpose(logits, perm=[0, 3, 1, 2]) return TFSemanticSegmenterOutput( From b994e35bf11b889a9ad2f1773d4e3a630d346c2d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 28 Jun 2022 15:19:28 +0530 Subject: [PATCH 04/29] chore: remove unused variable. --- playground_pt.py | 35 +++++++++++++++++++ playground_tf.py | 33 +++++++++++++++++ .../models/segformer/modeling_tf_segformer.py | 1 - 3 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 playground_pt.py create mode 100644 playground_tf.py diff --git a/playground_pt.py b/playground_pt.py new file mode 100644 index 000000000000..1f2368baa76b --- /dev/null +++ b/playground_pt.py @@ -0,0 +1,35 @@ + +from PIL import Image +from src.transformers.models.segformer import SegformerFeatureExtractor, SegformerForSemanticSegmentation +import torch + +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +feature_extractor = SegformerFeatureExtractor( + image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False +) +model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") +print(all([x.momentum is not None for x in model.modules() if isinstance(x, torch.nn.BatchNorm2d)])) + +image = prepare_img() +encoded_inputs = feature_extractor(images=image, return_tensors="pt") +pixel_values = encoded_inputs.pixel_values +print(pixel_values.size()) + +with torch.no_grad(): + outputs = model(pixel_values) + +expected_shape = torch.Size((1, model.config.num_labels, 128, 128)) +print(outputs.logits.shape == expected_shape) + +expected_slice = torch.tensor( + [ + [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]], + [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]], + [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]], + ] +) +print(torch.allclose(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/playground_tf.py b/playground_tf.py new file mode 100644 index 000000000000..4de9d12be679 --- /dev/null +++ b/playground_tf.py @@ -0,0 
+1,33 @@ + +from PIL import Image +from src.transformers.models.segformer import SegformerFeatureExtractor +from src.transformers.models.segformer.modeling_tf_segformer import TFSegformerForSemanticSegmentation +import tensorflow as tf +import numpy as np + +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +feature_extractor = SegformerFeatureExtractor( + image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False +) +model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512", from_pt=True) +image = prepare_img() +encoded_inputs = feature_extractor(images=image, return_tensors="tf") +pixel_values = encoded_inputs.pixel_values + +outputs = model(pixel_values, training=False) + +expected_shape = tf.TensorShape((1, model.config.num_labels, 128, 128)) +print(outputs.logits.shape == expected_shape) + +expected_slice = np.array( + [ + [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]], + [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]], + [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]], + ] +) +print(np.testing.assert_allclose(outputs.logits[0, :3, :3, :3].numpy(), expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index fd2124a042d4..248f08e841b4 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -147,7 +147,6 @@ def call( training: bool = False, ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: batch_size = shape_list(hidden_states)[0] - seq_len = shape_list(hidden_states)[1] num_channels = shape_list(hidden_states)[2] query_layer = self.transpose_for_scores(self.query(hidden_states)) From d472c3875be529810eca215cbe3cab3c5920ce25 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Tue, 28 Jun 2022 21:42:29 +0530 Subject: [PATCH 05/29] add test, remaining modifications. --- docs/source/en/index.mdx | 2 +- docs/source/en/model_doc/segformer.mdx | 27 +- src/transformers/__init__.py | 18 + .../models/auto/modeling_tf_auto.py | 3 + src/transformers/models/segformer/__init__.py | 38 +- .../models/segformer/modeling_tf_segformer.py | 63 ++- src/transformers/utils/dummy_tf_objects.py | 31 ++ .../segformer/test_modeling_tf_segformer.py | 454 ++++++++++++++++++ utils/documentation_tests.txt | 1 + 9 files changed, 615 insertions(+), 22 deletions(-) create mode 100644 tests/models/segformer/test_modeling_tf_segformer.py diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index 3f968621db0a..827d9bf089c1 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -271,7 +271,7 @@ Flax), PyTorch, and/or TensorFlow. | RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | | RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | | RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | -| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | | SEW | ❌ | ❌ | ✅ | ❌ | ❌ | | SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | | Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | diff --git a/docs/source/en/model_doc/segformer.mdx b/docs/source/en/model_doc/segformer.mdx index 9563e0843073..30cab88935b2 100644 --- a/docs/source/en/model_doc/segformer.mdx +++ b/docs/source/en/model_doc/segformer.mdx @@ -36,13 +36,14 @@ The figure below illustrates the architecture of SegFormer. 
Taken from the [orig -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/NVlabs/SegFormer). +This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version +of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/NVlabs/SegFormer). Tips: -- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decode head. +- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head. [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to - as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decode head on + as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decoder head on top to perform semantic segmentation of images. In addition, there's [`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw @@ -104,3 +105,23 @@ Tips: [[autodoc]] SegformerForSemanticSegmentation - forward + +## TFSegformerDecodeHead + +[[autodoc]] TFSegformerDecodeHead + - call + +## TFSegformerModel + +[[autodoc]] TFSegformerModel + - call + +## TFSegformerForImageClassification + +[[autodoc]] TFSegformerForImageClassification + - call + +## TFSegformerForSemanticSegmentation + +[[autodoc]] TFSegformerForSemanticSegmentation + - call diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 32f865a84756..da94628c3080 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2361,6 +2361,16 @@ "TFRoFormerPreTrainedModel", ] ) + _import_structure["models.segformer"].extend( + [ + "TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFSegformerDecodeHead", + "TFSegformerForImageClassification", + "TFSegformerForSemanticSegmentation", + "TFSegformerModel", + "TFSegformerPreTrainedModel", + ] + ) _import_structure["models.speech_to_text"].extend( [ "TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -4657,6 +4667,14 @@ TFRoFormerModel, TFRoFormerPreTrainedModel, ) + from .models.segformer import ( + TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + TFSegformerDecodeHead, + TFSegformerForImageClassification, + TFSegformerForSemanticSegmentation, + TFSegformerModel, + TFSegformerPreTrainedModel, + ) from .models.speech_to_text import ( TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST, TFSpeech2TextForConditionalGeneration, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 9c889597e59f..bb6452ddb5e0 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -65,6 +65,7 @@ ("rembert", "TFRemBertModel"), ("roberta", "TFRobertaModel"), ("roformer", "TFRoFormerModel"), + ("segformer", "TFSegformerModel"), ("speech_to_text", "TFSpeech2TextModel"), ("swin", "TFSwinModel"), ("t5", "TFT5Model"), @@ -173,6 +174,7 @@ # Model for Image-classsification ("convnext", "TFConvNextForImageClassification"), ("data2vec-vision", "TFData2VecVisionForImageClassification"), + ("segformer", "TFSegformerForImageClassification"), ("swin", "TFSwinForImageClassification"), ("vit", "TFViTForImageClassification"), ] @@ -182,6 +184,7 @@ [ # Model for Semantic Segmentation mapping ("data2vec-vision", 
"TFData2VecVisionForSemanticSegmentation"), + ("segformer", "TFSegformerForSemanticSegmentation"), ] ) diff --git a/src/transformers/models/segformer/__init__.py b/src/transformers/models/segformer/__init__.py index 1ce4ecb07a9c..2317237509a0 100644 --- a/src/transformers/models/segformer/__init__.py +++ b/src/transformers/models/segformer/__init__.py @@ -17,7 +17,13 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_tf_available, + is_torch_available, + is_vision_available, +) _import_structure = {"configuration_segformer": ["SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegformerConfig"]} @@ -46,6 +52,21 @@ "SegformerPreTrainedModel", ] +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_tf_segformer"] = [ + "TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFSegformerDecodeHead", + "TFSegformerForImageClassification", + "TFSegformerForSemanticSegmentation", + "TFSegformerModel", + "TFSegformerPreTrainedModel", + ] + if TYPE_CHECKING: from .configuration_segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig @@ -73,7 +94,20 @@ SegformerModel, SegformerPreTrainedModel, ) - + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_tf_segformer import ( + TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, + TFSegformerDecodeHead, + TFSegformerForImageClassification, + TFSegformerForSemanticSegmentation, + TFSegformerModel, + TFSegformerPreTrainedModel, + ) else: import sys diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 248f08e841b4..e908f4575d42 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -102,7 +102,14 @@ class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer): """SegFormer's efficient self-attention mechanism. 
Employs the sequence reduction process introduced in the [PvT paper](https://arxiv.org/abs/2102.12122).""" - def __init__(self, config, hidden_size: int, num_attention_heads: int, sequence_reduction_ratio: int, **kwargs): + def __init__( + self, + config: SegformerConfig, + hidden_size: int, + num_attention_heads: int, + sequence_reduction_ratio: int, + **kwargs + ): super().__init__(**kwargs) self.hidden_size = hidden_size self.num_attention_heads = num_attention_heads @@ -187,7 +194,7 @@ def call( class TFSegformerSelfOutput(tf.keras.layers.Layer): - def __init__(self, config, hidden_size: int, **kwargs): + def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs): super().__init__(**kwargs) self.dense = tf.keras.layers.Dense(hidden_size, name="dense") self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) @@ -199,7 +206,14 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: class TFSegformerAttention(tf.keras.layers.Layer): - def __init__(self, config, hidden_size: int, num_attention_heads: int, sequence_reduction_ratio: int, **kwargs): + def __init__( + self, + config: SegformerConfig, + hidden_size: int, + num_attention_heads: int, + sequence_reduction_ratio: int, + **kwargs + ): super().__init__(**kwargs) self.self = TFSegformerEfficientSelfAttention( config=config, @@ -241,7 +255,14 @@ def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: class TFSegformerMixFFN(tf.keras.layers.Layer): - def __init__(self, config, in_features: int, hidden_features: int = None, out_features: int = None, **kwargs): + def __init__( + self, + config: SegformerConfig, + in_features: int, + hidden_features: int = None, + out_features: int = None, + **kwargs + ): super().__init__(**kwargs) out_features = out_features or in_features self.dense1 = tf.keras.layers.Dense(hidden_features, name="dense1") @@ -325,7 +346,7 @@ def call( class TFSegformerEncoder(tf.keras.layers.Layer): - def __init__(self, config, **kwargs): + def __init__(self, config: SegformerConfig, **kwargs): super().__init__(**kwargs) self.config = config @@ -468,13 +489,19 @@ def call( training=training, ) sequence_output = encoder_outputs[0] + # Change to NCHW output format have uniformity in the modules + sequence_output = tf.transpose(sequence_output, perm=[0, 3, 1, 2]) + + # Change the other hidden state outputs to NCHW as well + if output_hidden_states: + hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]]) if not return_dict: return (sequence_output,) + encoder_outputs[1:] return TFBaseModelOutput( last_hidden_state=sequence_output, - hidden_states=encoder_outputs.hidden_states, + hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) @@ -497,8 +524,7 @@ def dummy_inputs(self) -> Dict[str, tf.Tensor]: Returns: `Dict[str, tf.Tensor]`: The dummy inputs. """ - # (todo: sayakpaul): change the batch size to 3 - VISION_DUMMY_INPUTS = tf.random.uniform(shape=(1, self.config.num_channels, 512, 512), dtype=tf.float32) + VISION_DUMMY_INPUTS = tf.random.uniform(shape=(3, self.config.num_channels, 512, 512), dtype=tf.float32) return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} @tf.function( @@ -531,7 +557,7 @@ def serving(self, inputs): Parameters: config ([`SegformerConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. 
Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. """ SEGFORMER_INPUTS_DOCSTRING = r""" @@ -552,8 +578,8 @@ def serving(self, inputs): used instead. return_dict (`bool`, *optional*): - Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This argument can be used - in eager mode, in graph mode the value will always be set to True. + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. training (`bool`, *optional*, defaults to `False``): Whether or not to use the model in training mode (some modules like dropout modules have different @@ -609,7 +635,7 @@ def call( SEGFORMER_START_DOCSTRING, ) class TFSegformerForImageClassification(TFSegformerPreTrainedModel): - def __init__(self, config, *inputs, **kwargs): + def __init__(self, config: SegformerConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.num_labels = config.num_labels @@ -646,6 +672,7 @@ def call( # convert last hidden states to (batch_size, height*width, hidden_size) batch_size = shape_list(sequence_output)[0] + sequence_output = tf.transpose(sequence_output, perm=[0, 2, 3, 1]) sequence_output = tf.reshape(sequence_output, (batch_size, -1, self.config.hidden_sizes[-1])) # global average pooling @@ -683,7 +710,7 @@ def call(self, hidden_states: tf.Tensor) -> tf.Tensor: class TFSegformerDecodeHead(TFSegformerPreTrainedModel): - def __init__(self, config, **kwargs): + def __init__(self, config: SegformerConfig, **kwargs): super().__init__(config, **kwargs) # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size mlps = [] @@ -710,20 +737,24 @@ def call(self, encoder_hidden_states): all_hidden_states = () for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps): if self.config.reshape_last_stage is False and len(shape_list(encoder_hidden_state)) == 3: - height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[-1], tf.float32)) + height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[1], tf.float32)) height = width = tf.cast(height, tf.int32) encoder_hidden_state = tf.reshape(encoder_hidden_state, (batch_size, height, width, -1)) # unify channel dimension + encoder_hidden_state = tf.transpose(encoder_hidden_state, perm=[0, 2, 3, 1]) height = shape_list(encoder_hidden_state)[1] width = shape_list(encoder_hidden_state)[2] encoder_hidden_state = mlp(encoder_hidden_state) encoder_hidden_state = tf.reshape(encoder_hidden_state, (batch_size, height, width, -1)) + # upsample + temp_state = tf.transpose(encoder_hidden_states[0], perm=[0, 2, 3, 1]) encoder_hidden_state = tf.image.resize( - encoder_hidden_state, size=shape_list(encoder_hidden_states[0])[1:-1], method="bilinear" + encoder_hidden_state, size=shape_list(temp_state)[1:-1], method="bilinear" ) all_hidden_states += (encoder_hidden_state,) + hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1)) hidden_states = self.batch_norm(hidden_states) hidden_states = self.activation(hidden_states) @@ -740,7 +771,7 @@ def call(self, encoder_hidden_states): SEGFORMER_START_DOCSTRING, ) class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel): - def __init__(self, config, **kwargs): + def __init__(self, config: SegformerConfig, **kwargs): super().__init__(config, 
**kwargs) self.segformer = TFSegformerMainLayer(config, name="segformer") self.decode_head = TFSegformerDecodeHead(config, name="decode_head") diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 4eb40113e76c..9608c1e9993a 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -1918,6 +1918,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) +TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFSegformerForImageClassification(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFSegformerModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFSegformerPreTrainedModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFSegformerForSemanticSegmentation(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py new file mode 100644 index 000000000000..2ea9c2a15383 --- /dev/null +++ b/tests/models/segformer/test_modeling_tf_segformer.py @@ -0,0 +1,454 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the TensorFlow SegFormer model. 
""" + +import inspect +import unittest + +import numpy as np + +from transformers.file_utils import is_tf_available, is_vision_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + SegformerConfig, + TFSegformerForImageClassification, + TFSegformerForSemanticSegmentation, + TFSegformerModel, + ) + from transformers.models.segformer.modeling_tf_segformer import TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST + +if is_vision_available(): + from PIL import Image + + from transformers import SegformerFeatureExtractor + + +class TFSegformerConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "hidden_sizes")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.parent.assertTrue(hasattr(config, "num_encoder_blocks")) + + +class TFSegformerModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + num_channels=3, + num_encoder_blocks=4, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + hidden_sizes=[16, 32, 64, 128], + downsampling_rates=[1, 4, 8, 16], + num_attention_heads=[1, 2, 4, 8], + is_training=True, + use_labels=True, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_encoder_blocks = num_encoder_blocks + self.sr_ratios = sr_ratios + self.depths = depths + self.hidden_sizes = hidden_sizes + self.downsampling_rates = downsampling_rates + self.num_attention_heads = num_attention_heads + self.is_training = is_training + self.use_labels = use_labels + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return SegformerConfig( + image_size=self.image_size, + num_channels=self.num_channels, + num_encoder_blocks=self.num_encoder_blocks, + depths=self.depths, + hidden_sizes=self.hidden_sizes, + num_attention_heads=self.num_attention_heads, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFSegformerModel(config=config) + result = model(pixel_values, training=False) + expected_height = expected_width = self.image_size // (self.downsampling_rates[-1] * 2) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], expected_height, expected_width) + ) + + def create_and_check_for_image_segmentation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + 
model = TFSegformerForSemanticSegmentation(config) + result = model(pixel_values, training=False) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4) + ) + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + +@require_tf +class TFSegformerModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (TFSegformerModel, TFSegformerForImageClassification, TFSegformerForSemanticSegmentation) + if is_tf_available() + else () + ) + + test_head_masking = False + test_onnx = False + test_pruning = False + test_resize_embeddings = False + + def setUp(self): + self.model_tester = TFSegformerModelTester(self) + self.config_tester = TFSegformerConfigTester(self, config_class=SegformerConfig, has_text_modality=False) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip("SegFormer does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip("SegFormer does not have get_input_embeddings method and get_output_embeddings methods") + def test_model_common_attributes(self): + pass + + @unittest.skip("Test was written for TF 1.x and isn't really relevant here") + def test_compile_tf_model(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + + expected_num_attentions = sum(self.model_tester.depths) + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + + self.assertEqual(len(attentions), expected_num_attentions) + + # verify the first attentions (first block, first layer) + expected_seq_len = (self.model_tester.image_size // 4) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2 + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len], + ) + + # verify the last attentions (last block, last layer) + expected_seq_len = 
(self.model_tester.image_size // 32) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (32 * self.model_tester.sr_ratios[-1])) ** 2 + self.assertListEqual( + list(attentions[-1].shape[-3:]), + [self.model_tester.num_attention_heads[-1], expected_seq_len, expected_reduced_seq_len], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_attentions) + # verify the first attentions (first block, first layer) + expected_seq_len = (self.model_tester.image_size // 4) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2 + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = self.model_tester.num_encoder_blocks + self.assertEqual(len(hidden_states), expected_num_layers) + + # verify the first hidden states (first block) + self.assertListEqual( + list(hidden_states[0].shape[-3:]), + [ + self.model_tester.hidden_sizes[0], + self.model_tester.image_size // 4, + self.model_tester.image_size // 4, + ], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # Overriding this method since the base method won't be compatible with Segformer. + def test_keras_fit(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + # Since `TFSegformerModel` cannot operate with the default `fit()` method. 
+ if model_class.__name__ != "TFSegformerModel": + model = model_class(config) + if getattr(model, "hf_compute_loss", None): + # Test that model correctly compute the loss with kwargs + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() + + label_names = {"labels"} + self.assertGreater(len(label_names), 0, msg="No matching label names found!") + labels = {key: val for key, val in prepared_for_class.items() if key in label_names} + inputs_minus_labels = { + key: val for key, val in prepared_for_class.items() if key not in label_names + } + self.assertGreater(len(inputs_minus_labels), 0) + model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True) + + # Make sure the model fits without crashing regardless of where we pass the labels + history1 = model.fit( + prepared_for_class, + validation_data=prepared_for_class, + steps_per_epoch=1, + validation_steps=1, + shuffle=False, + ) + val_loss1 = history1.history["val_loss"][0] + history2 = model.fit( + inputs_minus_labels, + labels, + validation_data=(inputs_minus_labels, labels), + steps_per_epoch=1, + validation_steps=1, + shuffle=False, + ) + val_loss2 = history2.history["val_loss"][0] + self.assertTrue(np.allclose(val_loss1, val_loss2, atol=1e-2, rtol=1e-3)) + + def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-4, name="outputs", attributes=None): + # We override with a slightly higher tol value, as semseg models tend to diverge a bit more + super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol, name, attributes) + + # Overriding this method since the base method won't be compatible with Segformer. + def test_loss_computation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + # Since `TFSegformerModel` won't have labels against which we + # could compute loss. 
+ if model_class.__name__ != "TFSegformerModel": + model = model_class(config) + if getattr(model, "hf_compute_loss", None): + # The number of elements in the loss should be the same as the number of elements in the label + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() + added_label = prepared_for_class[ + sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] + ] + loss_size = tf.size(added_label) + + # Test that model correctly compute the loss with kwargs + possible_input_names = {"input_ids", "pixel_values", "input_features"} + input_name = possible_input_names.intersection(set(prepared_for_class)).pop() + model_input = prepared_for_class.pop(input_name) + + loss = model(model_input, **prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a dict + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() + loss = model(**prepared_for_class)[0] + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a tuple + label_keys = prepared_for_class.keys() - inputs_dict.keys() + signature = inspect.signature(model.call).parameters + signature_names = list(signature.keys()) + + # Create a dictionary holding the location of the tensors in the tuple + tuple_index_mapping = {0: input_name} + for label_key in label_keys: + label_key_index = signature_names.index(label_key) + tuple_index_mapping[label_key_index] = label_key + sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) + # Initialize a list with their default values, update the values and convert to a tuple + list_input = [] + + for name in signature_names: + if name != "kwargs": + list_input.append(signature[name].default) + + for index, value in sorted_tuple_index_mapping: + list_input[index] = prepared_for_class[value] + + tuple_input = tuple(list_input) + + # Send to model + loss = model(tuple_input[:-1])[0] + + self.assertEqual(loss.shape, [loss_size]) + + @slow + def test_model_from_pretrained(self): + for model_name in TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = TFSegformerModel.from_pretrained(model_name, from_pt=True) + self.assertIsNotNone(model) + + +# We will verify our results on an image of cute cats +def prepare_img(): + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + return image + + +@require_tf +class TFSegformerModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_image_segmentation_ade(self): + # only resize + normalize + feature_extractor = SegformerFeatureExtractor( + image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False + ) + model = TFSegformerForSemanticSegmentation.from_pretrained( + "nvidia/segformer-b0-finetuned-ade-512-512", from_pt=True + ) + + image = prepare_img() + encoded_inputs = feature_extractor(images=image, return_tensors="tf") + pixel_values = encoded_inputs.pixel_values + + outputs = model(pixel_values, training=False) + + expected_shape = tf.TensorShape((1, model.config.num_labels, 128, 128)) + self.assertEqual(outputs.logits.shape, expected_shape) + + expected_slice = tf.constant( + [ + [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]], + [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]], + [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]], + ] + ) + 
tf.debugging.assert_near(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-4)
+
+    @slow
+    def test_inference_image_segmentation_city(self):
+        # only resize + normalize
+        feature_extractor = SegformerFeatureExtractor(
+            image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
+        )
+        model = TFSegformerForSemanticSegmentation.from_pretrained(
+            "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", from_pt=True
+        )
+
+        image = prepare_img()
+        encoded_inputs = feature_extractor(images=image, return_tensors="tf")
+        pixel_values = encoded_inputs.pixel_values
+
+        outputs = model(pixel_values, training=False)
+
+        expected_shape = tf.TensorShape((1, model.config.num_labels, 128, 128))
+        self.assertEqual(outputs.logits.shape, expected_shape)
+
+        expected_slice = tf.constant(
+            [
+                [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]],
+                [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]],
+                [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]],
+            ]
+        )
+        tf.debugging.assert_near(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-1)
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 6297e328912c..a006d9ba35da 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -59,6 +59,7 @@ src/transformers/models/sew_d/modeling_sew_d.py
 src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
 src/transformers/models/speech_to_text/modeling_speech_to_text.py
 src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
+src/transformers/models/segformer/modeling_tf_segformer.py
 src/transformers/models/swin/modeling_swin.py
 src/transformers/models/trocr/modeling_trocr.py
 src/transformers/models/unispeech/modeling_unispeech.py
From 22db5d0a3f58695fec2ac74cb4e9775c33a5d479 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 28 Jun 2022 21:43:28 +0530
Subject: [PATCH 06/29] remove: unnecessary files.
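
The manual verification in the playground scripts removed below is superseded by the
slow integration tests added in the previous commit. Those tests stop at the raw
logits, which come out channels-first with shape (1, num_labels, 128, 128) for a
512x512 input. A minimal sketch of the remaining post-processing step, turning such
logits into a per-pixel segmentation map; this is illustration only, not part of the
patch: the helper name is made up and only standard TensorFlow ops are assumed.

    import tensorflow as tf

    def logits_to_segmentation(logits: tf.Tensor, target_size=(512, 512)) -> tf.Tensor:
        # NCHW -> NHWC, since tf.image.resize expects channels-last inputs
        logits = tf.transpose(logits, perm=[0, 2, 3, 1])
        # bilinear upsampling from 1/4 resolution back to the input resolution,
        # mirroring the upsampling described in the loss computation
        logits = tf.image.resize(logits, size=target_size, method="bilinear")
        # per-pixel class ids, shape (batch, height, width)
        return tf.argmax(logits, axis=-1)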
--- playground_pt.py | 35 ----------------------------------- playground_tf.py | 33 --------------------------------- 2 files changed, 68 deletions(-) delete mode 100644 playground_pt.py delete mode 100644 playground_tf.py diff --git a/playground_pt.py b/playground_pt.py deleted file mode 100644 index 1f2368baa76b..000000000000 --- a/playground_pt.py +++ /dev/null @@ -1,35 +0,0 @@ - -from PIL import Image -from src.transformers.models.segformer import SegformerFeatureExtractor, SegformerForSemanticSegmentation -import torch - -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -feature_extractor = SegformerFeatureExtractor( - image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False -) -model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") -print(all([x.momentum is not None for x in model.modules() if isinstance(x, torch.nn.BatchNorm2d)])) - -image = prepare_img() -encoded_inputs = feature_extractor(images=image, return_tensors="pt") -pixel_values = encoded_inputs.pixel_values -print(pixel_values.size()) - -with torch.no_grad(): - outputs = model(pixel_values) - -expected_shape = torch.Size((1, model.config.num_labels, 128, 128)) -print(outputs.logits.shape == expected_shape) - -expected_slice = torch.tensor( - [ - [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]], - [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]], - [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]], - ] -) -print(torch.allclose(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-4)) \ No newline at end of file diff --git a/playground_tf.py b/playground_tf.py deleted file mode 100644 index 4de9d12be679..000000000000 --- a/playground_tf.py +++ /dev/null @@ -1,33 +0,0 @@ - -from PIL import Image -from src.transformers.models.segformer import SegformerFeatureExtractor -from src.transformers.models.segformer.modeling_tf_segformer import TFSegformerForSemanticSegmentation -import tensorflow as tf -import numpy as np - -def prepare_img(): - image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") - return image - - -feature_extractor = SegformerFeatureExtractor( - image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False -) -model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512", from_pt=True) -image = prepare_img() -encoded_inputs = feature_extractor(images=image, return_tensors="tf") -pixel_values = encoded_inputs.pixel_values - -outputs = model(pixel_values, training=False) - -expected_shape = tf.TensorShape((1, model.config.num_labels, 128, 128)) -print(outputs.logits.shape == expected_shape) - -expected_slice = np.array( - [ - [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]], - [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]], - [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]], - ] -) -print(np.testing.assert_allclose(outputs.logits[0, :3, :3, :3].numpy(), expected_slice, atol=1e-4)) \ No newline at end of file From 90629abb2a7a3f6a9b4b18bef4d7dc8fe579ad95 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 30 Jun 2022 13:13:06 +0530 Subject: [PATCH 07/29] add: rest of the files. 
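
One convention in this commit is worth spelling out: Keras convolutions compute in
channels-last (NHWC), while the public outputs of the TF port are transposed to
channels-first (NCHW) so that shapes and logit slices line up with the PyTorch
checkpoints loaded via `from_pt=True`. That is why `tf.transpose(..., perm=[0, 3, 1, 2])`
and its inverse `perm=[0, 2, 3, 1]` recur throughout the diff below. A self-contained
illustration (shapes chosen for demonstration, not taken from the patch):

    import tensorflow as tf

    nhwc = tf.random.uniform((1, 16, 16, 256))    # the layout Keras layers compute in
    nchw = tf.transpose(nhwc, perm=[0, 3, 1, 2])  # the layout callers receive: (1, 256, 16, 16)
    back = tf.transpose(nchw, perm=[0, 2, 3, 1])  # back to NHWC for ops such as tf.image.resize
    assert nchw.shape == (1, 256, 16, 16) and back.shape == nhwc.shape
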
Co-authored-by: matt --- .../models/segformer/modeling_tf_segformer.py | 44 +-- .../segformer/test_modeling_tf_segformer.py | 265 ++++++++++++------ 2 files changed, 203 insertions(+), 106 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index e908f4575d42..bee31d3b3679 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -27,7 +27,7 @@ replace_return_docstrings, ) from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput -from ...modeling_tf_utils import TFPreTrainedModel, keras_serializable, unpack_inputs +from ...modeling_tf_utils import TFPreTrainedModel, TFSequenceClassificationLoss, keras_serializable, unpack_inputs from ...tf_utils import shape_list, stable_softmax from ...utils import logging from .configuration_segformer import SegformerConfig @@ -249,8 +249,8 @@ def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: new_height = shape_list(hidden_states)[1] new_width = shape_list(hidden_states)[2] - hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, -1)) - + num_channels = shape_list(hidden_states)[3] + hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels)) return hidden_states @@ -333,7 +333,6 @@ def call( # first residual connection (with stochastic depth) attention_output = self.drop_path(attention_output, training=training) hidden_states = attention_output + hidden_states - mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width) # second residual connection (with stochastic depth) @@ -413,7 +412,7 @@ def call( hidden_states = pixel_values for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)): - embedding_layer, block_layer, norm_layer = x # all of these are lists + embedding_layer, block_layer, norm_layer = x # first, obtain patch embeddings hidden_states, height, width = embedding_layer(hidden_states) @@ -436,7 +435,8 @@ def call( # fourth, optionally reshape back to (batch_size, height, width, num_channels) if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage): - hidden_states = tf.reshape(hidden_states, (batch_size, height, width, -1)) + num_channels = shape_list(hidden_states)[-1] + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -497,7 +497,11 @@ def call( hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]]) if not return_dict: - return (sequence_output,) + encoder_outputs[1:] + if tf.greater(len(encoder_outputs[1:]), 0): + transposed_encoder_outputs = tuple(tf.transpose(v, perm=[0, 3, 1, 2]) for v in encoder_outputs[1:][0]) + return (sequence_output,) + (transposed_encoder_outputs,) + else: + return (sequence_output,) + encoder_outputs[1:] return TFBaseModelOutput( last_hidden_state=sequence_output, @@ -524,7 +528,8 @@ def dummy_inputs(self) -> Dict[str, tf.Tensor]: Returns: `Dict[str, tf.Tensor]`: The dummy inputs. """ - VISION_DUMMY_INPUTS = tf.random.uniform(shape=(3, self.config.num_channels, 512, 512), dtype=tf.float32) + # todo: change the batch size back to 3 (sayakpaul). 
+ VISION_DUMMY_INPUTS = tf.random.uniform(shape=(1, self.config.num_channels, 512, 512), dtype=tf.float32) return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} @tf.function( @@ -634,7 +639,7 @@ def call( """, SEGFORMER_START_DOCSTRING, ) -class TFSegformerForImageClassification(TFSegformerPreTrainedModel): +class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceClassificationLoss): def __init__(self, config: SegformerConfig, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -701,9 +706,9 @@ def __init__(self, config: SegformerConfig, **kwargs): self.proj = tf.keras.layers.Dense(config.decoder_hidden_size, name="proj") def call(self, hidden_states: tf.Tensor) -> tf.Tensor: - hidden_dim = shape_list(hidden_states)[-1] height = shape_list(hidden_states)[1] width = shape_list(hidden_states)[2] + hidden_dim = shape_list(hidden_states)[-1] hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim)) hidden_states = self.proj(hidden_states) return hidden_states @@ -715,7 +720,7 @@ def __init__(self, config: SegformerConfig, **kwargs): # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size mlps = [] for i in range(config.num_encoder_blocks): - mlp = TFSegformerMLP(config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}") + mlp = TFSegformerMLP(config, name=f"linear_c.{i}") mlps.append(mlp) self.mlps = mlps @@ -750,9 +755,8 @@ def call(self, encoder_hidden_states): # upsample temp_state = tf.transpose(encoder_hidden_states[0], perm=[0, 2, 3, 1]) - encoder_hidden_state = tf.image.resize( - encoder_hidden_state, size=shape_list(temp_state)[1:-1], method="bilinear" - ) + upsample_resolution = shape_list(temp_state)[1:-1] + encoder_hidden_state = tf.image.resize(encoder_hidden_state, size=upsample_resolution, method="bilinear") all_hidden_states += (encoder_hidden_state,) hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1)) @@ -776,7 +780,7 @@ def __init__(self, config: SegformerConfig, **kwargs): self.segformer = TFSegformerMainLayer(config, name="segformer") self.decode_head = TFSegformerDecodeHead(config, name="decode_head") - def compute_loss(self, logits, labels): + def hf_compute_loss(self, logits, labels): # upsample logits to the images' original size if len(shape_list(labels)) > 3: label_interp_shape = shape_list(labels)[1:-1] @@ -787,15 +791,14 @@ def compute_loss(self, logits, labels): # compute weighted loss loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") - # Taken from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics. + # Adapted from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics. # Utility to mask the index to ignore during computing the loss. def masked_loss(real, pred): mask = tf.math.logical_not(tf.math.equal(real, self.config.semantic_loss_ignore_index)) loss_ = loss_fct(real, pred) mask = tf.cast(mask, dtype=loss_.dtype) loss_ *= mask - - return tf.reduce_sum(loss_) / tf.reduce_sum(mask) + return loss_ # No reduction since other HF losses don't do it. return masked_loss(labels, upsampled_logits) @@ -813,7 +816,8 @@ def call( r""" labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*): Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., - config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + config.num_labels - 1]`. 
If `config.num_labels > 1`, a (per-pixel) classification loss is computed + (Cross-Entropy). Returns: @@ -856,7 +860,7 @@ def call( if self.config.num_labels == 1: raise ValueError("The number of labels should be greater than one") else: - loss = self.compute_loss(logits, labels) + loss = self.hf_compute_loss(logits, labels) if not return_dict: if output_hidden_states: diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py index 2ea9c2a15383..1de6bae874c6 100644 --- a/tests/models/segformer/test_modeling_tf_segformer.py +++ b/tests/models/segformer/test_modeling_tf_segformer.py @@ -16,6 +16,7 @@ import inspect import unittest +from typing import List, Tuple import numpy as np @@ -114,6 +115,7 @@ def get_config(self): hidden_dropout_prob=self.hidden_dropout_prob, attention_probs_dropout_prob=self.attention_probs_dropout_prob, initializer_range=self.initializer_range, + num_labels=self.num_labels, ) def create_and_check_model(self, config, pixel_values, labels): @@ -142,6 +144,15 @@ def prepare_config_and_inputs_for_common(self): inputs_dict = {"pixel_values": pixel_values} return config, inputs_dict + def prepare_config_and_inputs_for_keras_fit(self, for_segmentation: bool = False): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, seg_labels = config_and_inputs + if for_segmentation: + inputs_dict = {"pixel_values": pixel_values, "labels": seg_labels} + else: + inputs_dict = {"pixel_values": pixel_values, "labels": tf.zeros((self.batch_size))} + return config, inputs_dict + @require_tf class TFSegformerModelTest(TFModelTesterMixin, unittest.TestCase): @@ -281,106 +292,188 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - # Overriding this method since the base method won't be compatible with Segformer. + def test_model_outputs_equivalence(self): + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=( + "Tuple and dict output are not equal. 
Difference:" + f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}" + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + if self.has_attentions: + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + # todo: incorporate label support for semantic segmentation in `test_modeling_tf_common.py`. + + def test_dataset_conversion(self): + gpus = tf.config.list_physical_devices("GPU") + # Grouped convs aren't supported on CPUs for backprop. + if len(gpus) >= 1: + super().test_dataset_conversion() + def test_keras_fit(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() + gpus = tf.config.list_physical_devices("GPU") + + def apply(model): + if getattr(model, "hf_compute_loss", None): + model_weights = model.get_weights() + + # Test that model correctly compute the loss with kwargs + for_segmentation = True if model_class.__name__ == "TFSegformerForSemanticSegmentation" else False + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit( + for_segmentation=for_segmentation + ) + + label_names = {"labels"} + self.assertGreater(len(label_names), 0, msg="No matching label names found!") + labels = {key: val for key, val in prepared_for_class.items() if key in label_names} + inputs_minus_labels = {key: val for key, val in prepared_for_class.items() if key not in label_names} + self.assertGreater(len(inputs_minus_labels), 0) + model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True) + + # Make sure the model fits without crashing regardless of where we pass the labels + history1 = model.fit( + prepared_for_class, + validation_data=prepared_for_class, + steps_per_epoch=1, + validation_steps=1, + shuffle=False, + ) + val_loss1 = history1.history["val_loss"][0] + + # We reinitialize the model here even though our learning rate was zero + # because BatchNorm updates weights by means other than gradient descent. + model.set_weights(model_weights) + history2 = model.fit( + inputs_minus_labels, + labels, + validation_data=(inputs_minus_labels, labels), + steps_per_epoch=1, + validation_steps=1, + shuffle=False, + ) + val_loss2 = history2.history["val_loss"][0] + self.assertTrue(np.allclose(val_loss1, val_loss2, atol=1e-2, rtol=1e-3)) + for model_class in self.all_model_classes: # Since `TFSegformerModel` cannot operate with the default `fit()` method. if model_class.__name__ != "TFSegformerModel": + # Grouped convs and backprop with them isn't supported on CPUs. 
model = model_class(config) - if getattr(model, "hf_compute_loss", None): - # Test that model correctly compute the loss with kwargs - _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() - - label_names = {"labels"} - self.assertGreater(len(label_names), 0, msg="No matching label names found!") - labels = {key: val for key, val in prepared_for_class.items() if key in label_names} - inputs_minus_labels = { - key: val for key, val in prepared_for_class.items() if key not in label_names - } - self.assertGreater(len(inputs_minus_labels), 0) - model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True) - - # Make sure the model fits without crashing regardless of where we pass the labels - history1 = model.fit( - prepared_for_class, - validation_data=prepared_for_class, - steps_per_epoch=1, - validation_steps=1, - shuffle=False, - ) - val_loss1 = history1.history["val_loss"][0] - history2 = model.fit( - inputs_minus_labels, - labels, - validation_data=(inputs_minus_labels, labels), - steps_per_epoch=1, - validation_steps=1, - shuffle=False, - ) - val_loss2 = history2.history["val_loss"][0] - self.assertTrue(np.allclose(val_loss1, val_loss2, atol=1e-2, rtol=1e-3)) - - def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-4, name="outputs", attributes=None): - # We override with a slightly higher tol value, as semseg models tend to diverge a bit more - super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol, name, attributes) + if len(gpus) > 1: + apply(model) - # Overriding this method since the base method won't be compatible with Segformer. def test_loss_computation(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def apply(model): + for_segmentation = True if model_class.__name__ == "TFSegformerForSemanticSegmentation" else False + # The number of elements in the loss should be the same as the number of elements in the label + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit( + for_segmentation=for_segmentation + ) + added_label = prepared_for_class[ + sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] + ] + loss_size = tf.size(added_label) + + # Test that model correctly compute the loss with kwargs + possible_input_names = {"input_ids", "pixel_values", "input_features"} + input_name = possible_input_names.intersection(set(prepared_for_class)).pop() + model_input = prepared_for_class.pop(input_name) + + loss = model(model_input, **prepared_for_class)[0] + + if model_class.__name__ == "TFSegformerForSemanticSegmentation": + # Segmentation loss is non-reduced. This means if the labels array + # has a shape of (batch_size, height, width) then the loss will + # also have the same shape. So, we compare the loss sizes directly. 
+ self.assertEqual(tf.size(loss), loss_size) + else: + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a dict + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit( + for_segmentation=for_segmentation + ) + loss = model(**prepared_for_class)[0] + + if model_class.__name__ == "TFSegformerForSemanticSegmentation": + self.assertEqual(tf.size(loss), loss_size) + else: + self.assertEqual(loss.shape, [loss_size]) + + # Test that model correctly compute the loss with a tuple + label_keys = prepared_for_class.keys() - inputs_dict.keys() + signature = inspect.signature(model.call).parameters + signature_names = list(signature.keys()) + + # Create a dictionary holding the location of the tensors in the tuple + tuple_index_mapping = {0: input_name} + for label_key in label_keys: + label_key_index = signature_names.index(label_key) + tuple_index_mapping[label_key_index] = label_key + sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) + # Initialize a list with their default values, update the values and convert to a tuple + list_input = [] + + for name in signature_names: + if name != "kwargs": + list_input.append(signature[name].default) + + for index, value in sorted_tuple_index_mapping: + list_input[index] = prepared_for_class[value] + + tuple_input = tuple(list_input) + + # Send to model + loss = model(tuple_input[:-1])[0] + if model_class.__name__ == "TFSegformerForSemanticSegmentation": + self.assertEqual(tf.size(loss), loss_size) + else: + self.assertEqual(loss.shape, [loss_size]) + for model_class in self.all_model_classes: # Since `TFSegformerModel` won't have labels against which we # could compute loss. if model_class.__name__ != "TFSegformerModel": model = model_class(config) - if getattr(model, "hf_compute_loss", None): - # The number of elements in the loss should be the same as the number of elements in the label - _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() - added_label = prepared_for_class[ - sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] - ] - loss_size = tf.size(added_label) - - # Test that model correctly compute the loss with kwargs - possible_input_names = {"input_ids", "pixel_values", "input_features"} - input_name = possible_input_names.intersection(set(prepared_for_class)).pop() - model_input = prepared_for_class.pop(input_name) - - loss = model(model_input, **prepared_for_class)[0] - self.assertEqual(loss.shape, [loss_size]) - - # Test that model correctly compute the loss with a dict - _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit() - loss = model(**prepared_for_class)[0] - self.assertEqual(loss.shape, [loss_size]) - - # Test that model correctly compute the loss with a tuple - label_keys = prepared_for_class.keys() - inputs_dict.keys() - signature = inspect.signature(model.call).parameters - signature_names = list(signature.keys()) - - # Create a dictionary holding the location of the tensors in the tuple - tuple_index_mapping = {0: input_name} - for label_key in label_keys: - label_key_index = signature_names.index(label_key) - tuple_index_mapping[label_key_index] = label_key - sorted_tuple_index_mapping = sorted(tuple_index_mapping.items()) - # Initialize a list with their default values, update the values and convert to a tuple - list_input = [] - - for name in signature_names: - if name != "kwargs": - list_input.append(signature[name].default) - - for index, value in 
sorted_tuple_index_mapping: - list_input[index] = prepared_for_class[value] - - tuple_input = tuple(list_input) - - # Send to model - loss = model(tuple_input[:-1])[0] - - self.assertEqual(loss.shape, [loss_size]) + apply(model) + + def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-4, name="outputs", attributes=None): + # We override with a slightly higher tol value, as semseg models tend to diverge a bit more + super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol, name, attributes) @slow def test_model_from_pretrained(self): From 8d913e6bae8b9f5d8c857c44f664aad3bdc8205e Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 30 Jun 2022 13:24:04 +0530 Subject: [PATCH 08/29] chore: remove ModuleList comment. --- src/transformers/models/segformer/modeling_tf_segformer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index bee31d3b3679..243eb5a9ec58 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -364,7 +364,6 @@ def __init__(self, config: SegformerConfig, **kwargs): ) ) self.embeddings = embeddings - # self.patch_embeddings = nn.ModuleList(embeddings) # Transformer blocks blocks = [] @@ -388,7 +387,6 @@ def __init__(self, config: SegformerConfig, **kwargs): ) blocks.append(layers) - # self.block = nn.ModuleList(blocks) self.block = blocks # Layer norms From 3152f05b85330e51e10d23875c89a1c6f443ece5 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 30 Jun 2022 13:35:14 +0530 Subject: [PATCH 09/29] chore: apply make style. --- src/transformers/models/auto/modeling_tf_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 3201558f00c6..89cb71ea823a 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -175,8 +175,8 @@ # Model for Image-classsification ("convnext", "TFConvNextForImageClassification"), ("data2vec-vision", "TFData2VecVisionForImageClassification"), - ("segformer", "TFSegformerForImageClassification"), ("regnet", "TFRegNetForImageClassification"), + ("segformer", "TFSegformerForImageClassification"), ("swin", "TFSwinForImageClassification"), ("vit", "TFViTForImageClassification"), ] From 129db920b283f20be38417cb15221536853b9a11 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 30 Jun 2022 13:41:36 +0530 Subject: [PATCH 10/29] chore: apply make fixup-copies. 
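
The hunk below is generated tooling output: it alphabetizes the dummy objects and adds
the previously missing `TFSegformerDecodeHead` placeholder. These dummies exist so that
`from transformers import TFSegformerModel` still succeeds when TensorFlow is not
installed, and only raises an actionable error once the class is actually used. A rough,
simplified stand-in for the idea (the real `DummyObject`/`requires_backends` machinery in
`transformers.utils` differs in detail):

    class DummyObject(type):
        # Instantiating any class built on this metaclass fails with a helpful error.
        def __call__(cls, *args, **kwargs):
            raise ImportError(f"{cls.__name__} requires the TensorFlow backend: pip install tensorflow")

    class TFSegformerModel(metaclass=DummyObject):  # placeholder exported when TF is absent
        _backends = ["tf"]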
--- src/transformers/utils/dummy_tf_objects.py | 25 ++++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index cf6b8f9da149..c201862b1754 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -1918,55 +1918,62 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None +TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None -class TFSpeech2TextForConditionalGeneration(metaclass=DummyObject): +class TFSegformerDecodeHead(metaclass=DummyObject): _backends = ["tf"] def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFSpeech2TextModel(metaclass=DummyObject): +class TFSegformerForImageClassification(metaclass=DummyObject): _backends = ["tf"] def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFSpeech2TextPreTrainedModel(metaclass=DummyObject): +class TFSegformerForSemanticSegmentation(metaclass=DummyObject): _backends = ["tf"] def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None +class TFSegformerModel(metaclass=DummyObject): + _backends = ["tf"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) -class TFSegformerForImageClassification(metaclass=DummyObject): +class TFSegformerPreTrainedModel(metaclass=DummyObject): _backends = ["tf"] def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFSegformerModel(metaclass=DummyObject): +TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFSpeech2TextForConditionalGeneration(metaclass=DummyObject): _backends = ["tf"] def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFSegformerPreTrainedModel(metaclass=DummyObject): +class TFSpeech2TextModel(metaclass=DummyObject): _backends = ["tf"] def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) -class TFSegformerForSemanticSegmentation(metaclass=DummyObject): +class TFSpeech2TextPreTrainedModel(metaclass=DummyObject): _backends = ["tf"] def __init__(self, *args, **kwargs): From 93fafd448e52ea0867c0d5ba9b00c244e862c081 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 30 Jun 2022 13:50:15 +0530 Subject: [PATCH 11/29] add to check_repo.py --- utils/check_repo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_repo.py b/utils/check_repo.py index 9905bb00544b..d6ea53b0b6b1 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -137,6 +137,7 @@ "PerceiverForMultimodalAutoencoding", "PerceiverForOpticalFlow", "SegformerDecodeHead", + "TFSegformerDecodeHead", "FlaxBeitForMaskedImageModeling", "PLBartEncoder", "PLBartDecoder", From eb33e0ff4b9b2cf5609e3d1ea6c78f0dc7896a3f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 30 Jun 2022 13:57:11 +0530 Subject: [PATCH 12/29] add decode head to IGNORE_NON_TESTED --- utils/check_repo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check_repo.py b/utils/check_repo.py index d6ea53b0b6b1..c084b9d9cd3f 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -98,6 +98,7 @@ "FlaxBartForCausalLM", # Building part of bigger (tested) model. "FlaxBertForCausalLM", # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM. "OPTDecoderWrapper", + "TFSegformerDecodeHead", # Not a regular model. 
] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't From 48f836ffd7807a4cf9d69f5fb642ab6ae0490efc Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 30 Jun 2022 14:02:33 +0530 Subject: [PATCH 13/29] chore: run make style. --- utils/check_repo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index c084b9d9cd3f..90d4218bdd6e 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -98,7 +98,7 @@ "FlaxBartForCausalLM", # Building part of bigger (tested) model. "FlaxBertForCausalLM", # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM. "OPTDecoderWrapper", - "TFSegformerDecodeHead", # Not a regular model. + "TFSegformerDecodeHead", # Not a regular model. ] # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't From 828960de4c7671f80bb3feaa8dd300e493278963 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 1 Jul 2022 08:39:24 +0530 Subject: [PATCH 14/29] chore: PR comments. --- .../models/segformer/modeling_tf_segformer.py | 12 ++++++------ tests/models/segformer/test_modeling_segformer.py | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 243eb5a9ec58..75a64254b9d4 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -138,11 +138,13 @@ def __init__( self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor: - # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + # Reshape from [batch_size, seq_length, all_head_size] + # to [batch_size, seq_length, num_attention_heads, attention_head_size] batch_size = shape_list(tensor)[0] tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) - # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] + # to [batch_size, num_attention_heads, seq_length, attention_head_size] return tf.transpose(tensor, perm=[0, 2, 1, 3]) def call( @@ -442,9 +444,7 @@ def call( if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) return TFBaseModelOutput( - last_hidden_state=hidden_states, - hidden_states=all_hidden_states, - attentions=all_self_attentions, + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions ) @@ -829,7 +829,7 @@ def call( >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) - >>> feature_extractor = AutoFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") + >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") >>> model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") >>> inputs = feature_extractor(images=image, return_tensors="tf") diff --git 
a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py
index 9af59299f8ec..6a1d273f6642 100644
--- a/tests/models/segformer/test_modeling_segformer.py
+++ b/tests/models/segformer/test_modeling_segformer.py
@@ -18,7 +18,7 @@
 import inspect
 import unittest
 
-from transformers import is_torch_available, is_vision_available
+from transformers import SegformerConfig, is_torch_available, is_vision_available
 from transformers.models.auto import get_values
 from transformers.testing_utils import require_torch, slow, torch_device
 
@@ -31,7 +31,6 @@
     from transformers import (
         MODEL_MAPPING,
-        SegformerConfig,
         SegformerForImageClassification,
         SegformerForSemanticSegmentation,
         SegformerModel,

From 942bec147b1e27091ce997893d467e15561480dd Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sat, 2 Jul 2022 14:46:01 +0530
Subject: [PATCH 15/29] chore: minor changes to model doc.

---
 docs/source/en/model_doc/segformer.mdx | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/docs/source/en/model_doc/segformer.mdx b/docs/source/en/model_doc/segformer.mdx
index 30cab88935b2..5dc58f6780cd 100644
--- a/docs/source/en/model_doc/segformer.mdx
+++ b/docs/source/en/model_doc/segformer.mdx
@@ -66,7 +66,8 @@ Tips:
   used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as background class and
   include this class as part of all labels. In that case, `reduce_labels` should be set to `False`, as loss should
   also be computed for the background class.
-- As most models, SegFormer comes in different sizes, the details of which can be found in the table below.
+- Like most models, SegFormer comes in different sizes, the details of which can be found in the table below
+  (taken from Table 7 of the [original paper](https://arxiv.org/abs/2105.15203)).
 
 | **Model variant** | **Depths** | **Hidden sizes** | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
 | :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: |
@@ -77,6 +78,10 @@ Tips:
 | MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 |
 | MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 |
 
+Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. For
+SegFormer's results on segmentation datasets like ADE20k, refer to the [paper](https://arxiv.org/abs/2105.15203).
+
+
 ## SegformerConfig
 
 [[autodoc]] SegformerConfig

From c5bf93bc50dddac878bf15e417aba6f91593ee72 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sat, 2 Jul 2022 15:48:29 +0530
Subject: [PATCH 16/29] tests: reduction across samples.

---
 .../models/segformer/modeling_tf_segformer.py        |  7 +++++--
 tests/models/segformer/test_modeling_tf_segformer.py | 12 ++++++------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py
index 75a64254b9d4..c8840f7072c4 100644
--- a/src/transformers/models/segformer/modeling_tf_segformer.py
+++ b/src/transformers/models/segformer/modeling_tf_segformer.py
@@ -796,7 +796,10 @@ def masked_loss(real, pred):
             loss_ = loss_fct(real, pred)
             mask = tf.cast(mask, dtype=loss_.dtype)
             loss_ *= mask
-            return loss_  # No reduction since other HF losses don't do it.
+            # We return the loss per sample.
+            # Initially the loss has shape (batch_size, height, width); it is then
+            # reduced across the spatial dimensions per sample.
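+            # Dividing by the mask sum below makes this a mean over only the non-ignored pixels.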
+            return tf.reduce_sum(loss_, axis=(1, 2)) / tf.reduce_sum(mask, axis=(1, 2))
 
         return masked_loss(labels, upsampled_logits)
 
@@ -858,7 +861,7 @@ def call(
             if self.config.num_labels == 1:
                 raise ValueError("The number of labels should be greater than one")
             else:
-                loss = self.hf_compute_loss(logits, labels)
+                loss = self.hf_compute_loss(logits=logits, labels=labels)
 
         if not return_dict:
             if output_hidden_states:
diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py
index 1de6bae874c6..55a283e83318 100644
--- a/tests/models/segformer/test_modeling_tf_segformer.py
+++ b/tests/models/segformer/test_modeling_tf_segformer.py
@@ -416,10 +416,10 @@ def apply(model):
                 loss = model(model_input, **prepared_for_class)[0]
 
                 if model_class.__name__ == "TFSegformerForSemanticSegmentation":
-                    # Segmentation loss is non-reduced. This means if the labels array
-                    # has a shape of (batch_size, height, width) then the loss will
-                    # also have the same shape. So, we compare the loss sizes directly.
-                    self.assertEqual(tf.size(loss), loss_size)
+                    # Semantic segmentation labels have a shape of (batch_size, height, width).
+                    # But the loss is returned per sample, with a shape of (batch_size,).
+                    # So, we perform the assertion accordingly.
+                    self.assertEqual(loss.shape[0], added_label.shape[0])
                 else:
                     self.assertEqual(loss.shape, [loss_size])
 
@@ -430,7 +430,7 @@ def apply(model):
                 loss = model(**prepared_for_class)[0]
 
                 if model_class.__name__ == "TFSegformerForSemanticSegmentation":
-                    self.assertEqual(tf.size(loss), loss_size)
+                    self.assertEqual(loss.shape[0], prepared_for_class["labels"].shape[0])
                 else:
                     self.assertEqual(loss.shape, [loss_size])
 
@@ -460,7 +460,7 @@ def apply(model):
                 # Send to model
                 loss = model(tuple_input[:-1])[0]
                 if model_class.__name__ == "TFSegformerForSemanticSegmentation":
-                    self.assertEqual(tf.size(loss), loss_size)
+                    self.assertEqual(loss.shape[0], tuple_input[1].shape[0])
                 else:
                     self.assertEqual(loss.shape, [loss_size])

From a641451fa27b96dd21db2cce688473fdfd228e66 Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Sun, 3 Jul 2022 07:21:46 +0530
Subject: [PATCH 17/29] add a note on the space.

---
 docs/source/en/model_doc/segformer.mdx | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/en/model_doc/segformer.mdx b/docs/source/en/model_doc/segformer.mdx
index 5dc58f6780cd..b359767ce4fb 100644
--- a/docs/source/en/model_doc/segformer.mdx
+++ b/docs/source/en/model_doc/segformer.mdx
@@ -52,6 +52,8 @@ Tips:
   found on the [hub](https://huggingface.co/models?other=segformer).
 - The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data.
+- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers)
+  to try out a SegFormer model on custom images.
 - SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`.
 - One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps for the model.
  Note that this feature extractor is fairly basic and does not include all data augmentations used in

From a9f7ec82ebdeea42a98cf8d452c385c137f91f6f Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 5 Jul 2022 09:06:40 +0530
Subject: [PATCH 18/29] sort imports.

---
 tests/models/segformer/test_modeling_tf_segformer.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py
index 55a283e83318..8eb3ce6450cf 100644
--- a/tests/models/segformer/test_modeling_tf_segformer.py
+++ b/tests/models/segformer/test_modeling_tf_segformer.py
@@ -20,6 +20,7 @@
 
 import numpy as np
 
+from transformers import SegformerConfig
 from transformers.file_utils import is_tf_available, is_vision_available
 from transformers.testing_utils import require_tf, slow
 
@@ -30,12 +31,7 @@
 if is_tf_available():
     import tensorflow as tf
 
-    from transformers import (
-        SegformerConfig,
-        TFSegformerForImageClassification,
-        TFSegformerForSemanticSegmentation,
-        TFSegformerModel,
-    )
+    from transformers import TFSegformerForImageClassification, TFSegformerForSemanticSegmentation, TFSegformerModel
     from transformers.models.segformer.modeling_tf_segformer import TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST
 
 if is_vision_available():

From d414f24141fc950f82ce04b00acc7cb1e8a76aba Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Tue, 5 Jul 2022 21:19:43 +0530
Subject: [PATCH 19/29] fix: reduction in loss computation.

---
 .../models/segformer/modeling_tf_segformer.py        |  8 ++++----
 tests/models/segformer/test_modeling_tf_segformer.py | 11 +++++------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py
index c8840f7072c4..25e401932463 100644
--- a/src/transformers/models/segformer/modeling_tf_segformer.py
+++ b/src/transformers/models/segformer/modeling_tf_segformer.py
@@ -796,10 +796,10 @@ def masked_loss(real, pred):
             loss_ = loss_fct(real, pred)
             mask = tf.cast(mask, dtype=loss_.dtype)
             loss_ *= mask
-            # We return the loss per sample.
-            # Initially the loss has shape (batch_size, height, width); it is then
-            # reduced across the spatial dimensions per sample.
-            # Dividing by the mask sum below makes this a mean over only the non-ignored pixels.
-            return tf.reduce_sum(loss_, axis=(1, 2)) / tf.reduce_sum(mask, axis=(1, 2))
+            # Reduction strategy in the similar spirit with
+            # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210
+            reduced_masked_loss = tf.reduce_sum(loss_) / tf.reduce_sum(mask)
+            return tf.reshape(reduced_masked_loss, (1,))
 
         return masked_loss(labels, upsampled_logits)
 
diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py
index 8eb3ce6450cf..355d5a30c4ca 100644
--- a/tests/models/segformer/test_modeling_tf_segformer.py
+++ b/tests/models/segformer/test_modeling_tf_segformer.py
@@ -412,10 +412,9 @@ def apply(model):
                 loss = model(model_input, **prepared_for_class)[0]
 
                 if model_class.__name__ == "TFSegformerForSemanticSegmentation":
-                    # Semantic segmentation labels have a shape of (batch_size, height, width).
-                    # But the loss is returned per sample, with a shape of (batch_size,).
-                    # So, we perform the assertion accordingly.
-                    self.assertEqual(loss.shape[0], added_label.shape[0])
+                    # Semantic segmentation loss is computed similarly as
+                    # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210.
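+                    # i.e., the masked per-pixel losses are reduced to a single scalar of shape (1,).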
+ self.assertEqual(loss.shape, (1,)) else: self.assertEqual(loss.shape, [loss_size]) @@ -426,7 +425,7 @@ def apply(model): loss = model(**prepared_for_class)[0] if model_class.__name__ == "TFSegformerForSemanticSegmentation": - self.assertEqual(loss.shape[0], prepared_for_class["labels"].shape[0]) + self.assertEqual(loss.shape, (1,)) else: self.assertEqual(loss.shape, [loss_size]) @@ -456,7 +455,7 @@ def apply(model): # Send to model loss = model(tuple_input[:-1])[0] if model_class.__name__ == "TFSegformerForSemanticSegmentation": - self.assertEqual(loss.shape[0], tuple_input[1].shape[0]) + self.assertEqual(loss.shape, (1,)) else: self.assertEqual(loss.shape, [loss_size]) From 4d7f5a17e9266ad4dba1eae648538186ef6b63b9 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 6 Jul 2022 21:04:25 +0530 Subject: [PATCH 20/29] chore: align loss function with that of NER. --- .../models/segformer/modeling_tf_segformer.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 25e401932463..2cf68720ee63 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -789,16 +789,13 @@ def hf_compute_loss(self, logits, labels): # compute weighted loss loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none") - # Adapted from https://www.tensorflow.org/text/tutorials/transformer#loss_and_metrics. - # Utility to mask the index to ignore during computing the loss. def masked_loss(real, pred): - mask = tf.math.logical_not(tf.math.equal(real, self.config.semantic_loss_ignore_index)) - loss_ = loss_fct(real, pred) - mask = tf.cast(mask, dtype=loss_.dtype) - loss_ *= mask + unmasked_loss = loss_fct(real, pred) + mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype) + unmasked_loss *= mask # Reduction strategy in the similar spirit with # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210 - reduced_masked_loss = tf.reduce_sum(loss_) / tf.reduce_sum(mask) + reduced_masked_loss = tf.reduce_sum(unmasked_loss) / tf.reduce_sum(mask) return tf.reshape(reduced_masked_loss, (1,)) return masked_loss(labels, upsampled_logits) From ac49cef9b43da3c84d5dab8d6ae869437968255b Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 7 Jul 2022 18:36:16 +0530 Subject: [PATCH 21/29] chore: correct utils/documentation_tests.txt Co-authored-by: Joao Gante --- utils/documentation_tests.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt index f097d14c63b2..65d61499e9db 100644 --- a/utils/documentation_tests.txt +++ b/utils/documentation_tests.txt @@ -63,7 +63,7 @@ src/transformers/models/sew_d/modeling_sew_d.py src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py src/transformers/models/speech_to_text/modeling_speech_to_text.py src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py -src/transformers/models/swin/modeling_tf_segformer.py +src/transformers/models/segformer/modeling_tf_segformer.py src/transformers/models/swin/modeling_swin.py src/transformers/models/trocr/modeling_trocr.py src/transformers/models/unispeech/modeling_unispeech.py From ba93bb4eb90a0195821234cba72a45a2e32061cb Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 8 Jul 2022 10:16:06 +0530 Subject: 
[PATCH 22/29] chore: simplify the interpolation of logits in loss computation. --- src/transformers/models/segformer/modeling_tf_segformer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 2cf68720ee63..199f52be1cfc 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -780,10 +780,8 @@ def __init__(self, config: SegformerConfig, **kwargs): def hf_compute_loss(self, logits, labels): # upsample logits to the images' original size - if len(shape_list(labels)) > 3: - label_interp_shape = shape_list(labels)[1:-1] - else: - label_interp_shape = shape_list(labels)[-2:] + # `labels` is of shape (batch_size, height, width) + label_interp_shape = shape_list(labels)[1:] upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear") # compute weighted loss From 6c97e8d30e5ca3705ebbb36a01fd4d3070a97091 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Fri, 8 Jul 2022 10:36:54 +0530 Subject: [PATCH 23/29] chore: return transposed logits when return_dict=False. --- src/transformers/models/segformer/modeling_tf_segformer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 199f52be1cfc..8bab1250cae8 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -858,6 +858,10 @@ def call( else: loss = self.hf_compute_loss(logits=logits, labels=labels) + # make logits of shape (batch_size, num_labels, height, width) to + # keep them consistent across APIs + logits = tf.transpose(logits, perm=[0, 3, 1, 2]) + if not return_dict: if output_hidden_states: output = (logits,) + outputs[1:] @@ -865,9 +869,6 @@ def call( output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output - # make logits of shape (batch_size, num_labels, height, width) to - # keep them consistent across APIs - logits = tf.transpose(logits, perm=[0, 3, 1, 2]) return TFSemanticSegmenterOutput( loss=loss, logits=logits, From 4c8548405671c330285d35aea7b38996f4c306a5 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Thu, 14 Jul 2022 10:46:50 +0530 Subject: [PATCH 24/29] chore: add link to the tf fine-tuning repo. --- docs/source/en/model_doc/segformer.mdx | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/en/model_doc/segformer.mdx b/docs/source/en/model_doc/segformer.mdx index b359767ce4fb..b5c07f0d858c 100644 --- a/docs/source/en/model_doc/segformer.mdx +++ b/docs/source/en/model_doc/segformer.mdx @@ -52,6 +52,7 @@ Tips: found on the [hub](https://huggingface.co/models?other=segformer). - The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data. +- TensorFlow users should refer to [this repository](https://github.com/deep-diver/segformer-tf-transformers) that shows off-the-shelf inference and fine-tuning. 
- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers) to try out a SegFormer model on custom images. - SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`. From 4afb097a1c34cc2d4a6b6e2c82902499a529c7e0 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Mon, 18 Jul 2022 17:42:34 +0530 Subject: [PATCH 25/29] address pr comments. --- src/transformers/models/segformer/modeling_tf_segformer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 8bab1250cae8..657fec68e4c8 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -790,10 +790,10 @@ def hf_compute_loss(self, logits, labels): def masked_loss(real, pred): unmasked_loss = loss_fct(real, pred) mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype) - unmasked_loss *= mask + masked_loss = unmasked_loss * mask # Reduction strategy in the similar spirit with # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210 - reduced_masked_loss = tf.reduce_sum(unmasked_loss) / tf.reduce_sum(mask) + reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask) return tf.reshape(reduced_masked_loss, (1,)) return masked_loss(labels, upsampled_logits) From 8dd8b46999140484264e39057dbb096bb9ab056d Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 20 Jul 2022 07:17:40 +0530 Subject: [PATCH 26/29] address niels's comments. --- .../models/segformer/modeling_segformer.py | 3 ++- .../models/segformer/modeling_tf_segformer.py | 17 ++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index 22bf9234b2ae..ee07de3a5b2e 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -280,6 +280,7 @@ def forward(self, hidden_states, height, width, output_attentions=False): class SegformerDWConv(nn.Module): def __init__(self, dim=768): super().__init__() + # `dwconv` stands for depth-wise conv. self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) def forward(self, hidden_states, height, width): @@ -804,7 +805,7 @@ def forward( loss = None if labels is not None: - if self.config.num_labels == 1: + if not self.config.num_labels > 1: raise ValueError("The number of labels should be greater than one") else: # upsample logits to the images' original size diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 657fec68e4c8..58b86c56e2de 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -175,8 +175,8 @@ def call( # Take the dot product between "query" and "key" to get the raw attention scores. 
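        # shapes: (batch_size, num_heads, seq_len_q, head_dim) @ (batch_size, num_heads, seq_len_k, head_dim)^T
        # -> (batch_size, num_heads, seq_len_q, seq_len_k)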
attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) - dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) - attention_scores = tf.divide(attention_scores, dk) + scale = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, scale) # Normalize the attention scores to probabilities. attention_probs = stable_softmax(logits=attention_scores, axis=-1) @@ -224,14 +224,14 @@ def __init__( sequence_reduction_ratio=sequence_reduction_ratio, name="self", ) - self.sa_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output") + self.dense_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output") def call( self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: self_outputs = self.self(hidden_states, height, width, output_attentions) - attention_output = self.sa_output(self_outputs[0]) + attention_output = self.dense_output(self_outputs[0]) outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them return outputs @@ -241,7 +241,7 @@ def __init__(self, dim: int = 768, **kwargs): super().__init__(**kwargs) self.dwconv = tf.keras.layers.Conv2D( filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" - ) + ) # `dwconv` stands for depth-wise conv. def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: batch_size = shape_list(hidden_states)[0] @@ -487,7 +487,7 @@ def call( training=training, ) sequence_output = encoder_outputs[0] - # Change to NCHW output format have uniformity in the modules + # Change to NCHW output format to have uniformity in the modules sequence_output = tf.transpose(sequence_output, perm=[0, 3, 1, 2]) # Change the other hidden state outputs to NCHW as well @@ -526,8 +526,7 @@ def dummy_inputs(self) -> Dict[str, tf.Tensor]: Returns: `Dict[str, tf.Tensor]`: The dummy inputs. """ - # todo: change the batch size back to 3 (sayakpaul). - VISION_DUMMY_INPUTS = tf.random.uniform(shape=(1, self.config.num_channels, 512, 512), dtype=tf.float32) + VISION_DUMMY_INPUTS = tf.random.uniform(shape=(3, self.config.num_channels, 512, 512), dtype=tf.float32) return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)} @tf.function( @@ -853,7 +852,7 @@ def call( loss = None if labels is not None: - if self.config.num_labels == 1: + if not self.config.num_labels > 1: raise ValueError("The number of labels should be greater than one") else: loss = self.hf_compute_loss(logits=logits, labels=labels) From 52affaaccb911a1f25b26a7166e6a5bad94a7c4f Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 20 Jul 2022 15:33:08 +0530 Subject: [PATCH 27/29] remove from_pt=True since tf weights are in. 
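With the native TensorFlow weights now uploaded to the Hub checkpoints, the `from_pt=True` cross-loading flag is no longer needed. A minimal before/after sketch, using the same checkpoint the slow tests load:

```python
from transformers import TFSegformerForSemanticSegmentation

# Before: no native TF weights on the Hub, so the PyTorch checkpoint was
# converted on the fly at load time.
# model = TFSegformerForSemanticSegmentation.from_pretrained(
#     "nvidia/segformer-b0-finetuned-ade-512-512", from_pt=True
# )

# After: the native TensorFlow (h5) weights are downloaded and loaded directly.
model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
```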
--- tests/models/segformer/test_modeling_tf_segformer.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py index 355d5a30c4ca..fbf38fa1d6b2 100644 --- a/tests/models/segformer/test_modeling_tf_segformer.py +++ b/tests/models/segformer/test_modeling_tf_segformer.py @@ -473,7 +473,7 @@ def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-4, nam @slow def test_model_from_pretrained(self): for model_name in TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = TFSegformerModel.from_pretrained(model_name, from_pt=True) + model = TFSegformerModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -491,9 +491,7 @@ def test_inference_image_segmentation_ade(self): feature_extractor = SegformerFeatureExtractor( image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False ) - model = TFSegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b0-finetuned-ade-512-512", from_pt=True - ) + model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") image = prepare_img() encoded_inputs = feature_extractor(images=image, return_tensors="tf") @@ -520,7 +518,7 @@ def test_inference_image_segmentation_city(self): image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False ) model = TFSegformerForSemanticSegmentation.from_pretrained( - "nvidia/segformer-b1-finetuned-cityscapes-1024-1024", from_pt=True + "nvidia/segformer-b1-finetuned-cityscapes-1024-1024" ) image = prepare_img() From 4a41bdc93207f2f294fc9274900a8af4a11e8067 Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 20 Jul 2022 16:00:37 +0530 Subject: [PATCH 28/29] remove comment from pt model. --- src/transformers/models/segformer/modeling_segformer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index ee07de3a5b2e..bc40469da9b6 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -280,7 +280,6 @@ def forward(self, hidden_states, height, width, output_attentions=False): class SegformerDWConv(nn.Module): def __init__(self, dim=768): super().__init__() - # `dwconv` stands for depth-wise conv. self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim) def forward(self, hidden_states, height, width): From 9c1584c0442a635ac73987ad485199ce6db7ea7a Mon Sep 17 00:00:00 2001 From: Sayak Paul Date: Wed, 20 Jul 2022 17:07:31 +0530 Subject: [PATCH 29/29] address niels's comments. 
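Besides renaming `dwconv` to the more descriptive `depthwise_convolution`, this final patch pins the doctest outputs: the semantic segmentation logits come out with shape `(batch_size, num_labels, height, width)`, at 1/4th of the input resolution per side. As a usage sketch that is not part of the patch, turning those logits into a per-pixel segmentation map typically looks like this:

```python
import tensorflow as tf

# Stand-in for `outputs.logits` from TFSegformerForSemanticSegmentation:
# (batch_size, num_labels, height, width) == (1, 150, 128, 128) for a
# 512x512 ADE20k input.
logits = tf.random.uniform((1, 150, 128, 128))

# tf.image.resize expects channels-last, so move num_labels to the end first.
logits_hwc = tf.transpose(logits, perm=[0, 2, 3, 1])  # (1, 128, 128, 150)
upsampled = tf.image.resize(logits_hwc, size=(512, 512), method="bilinear")
segmentation_map = tf.argmax(upsampled, axis=-1)  # (1, 512, 512) class indices
```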
--- .../models/segformer/modeling_segformer.py | 2 ++ .../models/segformer/modeling_tf_segformer.py | 12 +++++++----- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index bc40469da9b6..9d0d6878a5cb 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -785,6 +785,8 @@ def forward( >>> inputs = feature_extractor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> logits = outputs.logits # shape (batch_size, num_labels, height, width) + >>> logits.shape + (1, 150, 128, 128) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = ( diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py index 58b86c56e2de..e64a10178bf8 100644 --- a/src/transformers/models/segformer/modeling_tf_segformer.py +++ b/src/transformers/models/segformer/modeling_tf_segformer.py @@ -239,15 +239,15 @@ def call( class TFSegformerDWConv(tf.keras.layers.Layer): def __init__(self, dim: int = 768, **kwargs): super().__init__(**kwargs) - self.dwconv = tf.keras.layers.Conv2D( + self.depthwise_convolution = tf.keras.layers.Conv2D( filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" - ) # `dwconv` stands for depth-wise conv. + ) def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: batch_size = shape_list(hidden_states)[0] num_channels = shape_list(hidden_states)[-1] hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) - hidden_states = self.dwconv(hidden_states) + hidden_states = self.depthwise_convolution(hidden_states) new_height = shape_list(hidden_states)[1] new_width = shape_list(hidden_states)[2] @@ -268,7 +268,7 @@ def __init__( super().__init__(**kwargs) out_features = out_features or in_features self.dense1 = tf.keras.layers.Dense(hidden_features, name="dense1") - self.dwconv = TFSegformerDWConv(hidden_features, name="dwconv") + self.depthwise_convolution = TFSegformerDWConv(hidden_features, name="dwconv") if isinstance(config.hidden_act, str): self.intermediate_act_fn = get_tf_activation(config.hidden_act) else: @@ -278,7 +278,7 @@ def __init__( def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: hidden_states = self.dense1(hidden_states) - hidden_states = self.dwconv(hidden_states, height, width) + hidden_states = self.depthwise_convolution(hidden_states, height, width) hidden_states = self.intermediate_act_fn(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.dense2(hidden_states) @@ -833,6 +833,8 @@ def call( >>> outputs = model(**inputs, training=False) >>> # logits are of shape (batch_size, num_labels, height, width) >>> logits = outputs.logits + >>> logits.shape + (1, 150, 128, 128) ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict output_hidden_states = (