diff --git a/docs/source/en/index.mdx b/docs/source/en/index.mdx index d06c9df122dc..b8cd08eb0473 100644 --- a/docs/source/en/index.mdx +++ b/docs/source/en/index.mdx @@ -278,7 +278,7 @@ Flax), PyTorch, and/or TensorFlow. | RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | | RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | | RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ | -| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ | +| SegFormer | ❌ | ❌ | ✅ | ✅ | ❌ | | SEW | ❌ | ❌ | ✅ | ❌ | ❌ | | SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ | | Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | diff --git a/docs/source/en/model_doc/segformer.mdx b/docs/source/en/model_doc/segformer.mdx index 9563e0843073..b5c07f0d858c 100644 --- a/docs/source/en/model_doc/segformer.mdx +++ b/docs/source/en/model_doc/segformer.mdx @@ -36,13 +36,14 @@ The figure below illustrates the architecture of SegFormer. Taken from the [orig -This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/NVlabs/SegFormer). +This model was contributed by [nielsr](https://huggingface.co/nielsr). The TensorFlow version +of the model was contributed by [sayakpaul](https://huggingface.co/sayakpaul). The original code can be found [here](https://github.com/NVlabs/SegFormer). Tips: -- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decode head. +- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head. [`SegformerModel`] is the hierarchical Transformer encoder (which in the paper is also referred to - as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decode head on + as Mix Transformer or MiT). [`SegformerForSemanticSegmentation`] adds the all-MLP decoder head on top to perform semantic segmentation of images. In addition, there's [`SegformerForImageClassification`] which can be used to - you guessed it - classify images. The authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw @@ -51,6 +52,9 @@ Tips: found on the [hub](https://huggingface.co/models?other=segformer). - The quickest way to get started with SegFormer is by checking the [example notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer) (which showcase both inference and fine-tuning on custom data). One can also check out the [blog post](https://huggingface.co/blog/fine-tune-segformer) introducing SegFormer and illustrating how it can be fine-tuned on custom data. +- TensorFlow users should refer to [this repository](https://github.com/deep-diver/segformer-tf-transformers) that shows off-the-shelf inference and fine-tuning. +- One can also check out [this interactive demo on Hugging Face Spaces](https://huggingface.co/spaces/chansung/segformer-tf-transformers) + to try out a SegFormer model on custom images. - SegFormer works on any input size, as it pads the input to be divisible by `config.patch_sizes`. - One can use [`SegformerFeatureExtractor`] to prepare images and corresponding segmentation maps for the model. Note that this feature extractor is fairly basic and does not include all data augmentations used in @@ -65,7 +69,8 @@ Tips: used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as background class and include this class as part of all labels. In that case, `reduce_labels` should be set to `False`, as loss should also be computed for the background class. 
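  A minimal sketch of that preprocessing step (the image and annotation file names below are placeholders, not assets shipped with the library):

  ```python
  from PIL import Image

  from transformers import SegformerFeatureExtractor

  # With reduce_labels=True, the background index 0 is remapped to 255 (which the
  # loss ignores) and every other label is decremented by one.
  feature_extractor = SegformerFeatureExtractor(reduce_labels=True)

  image = Image.open("scene.jpg")  # placeholder RGB image
  segmentation_map = Image.open("scene_annotation.png")  # placeholder per-pixel class map

  encoded = feature_extractor(images=image, segmentation_maps=segmentation_map, return_tensors="pt")
  print(encoded["pixel_values"].shape)  # torch.Size([1, 3, 512, 512])
  print(encoded["labels"].shape)  # torch.Size([1, 512, 512])
  ```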
-- As most models, SegFormer comes in different sizes, the details of which can be found in the table below.
+- Like most models, SegFormer comes in different sizes, the details of which can be found in the table below
+  (taken from Table 7 of the [original paper](https://arxiv.org/abs/2105.15203)).

| **Model variant** | **Depths**    | **Hidden sizes**    | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
| :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: |
@@ -76,6 +81,10 @@ Tips:
| MiT-b4            | [3, 8, 27, 3] | [64, 128, 320, 512] | 768                     | 62.6           | 83.6                  |
| MiT-b5            | [3, 6, 40, 3] | [64, 128, 320, 512] | 768                     | 82.0           | 83.8                  |

+Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. For
+SegFormer's results on segmentation datasets like ADE20k, refer to the [paper](https://arxiv.org/abs/2105.15203).
+
+
## SegformerConfig

[[autodoc]] SegformerConfig
@@ -104,3 +113,23 @@ Tips:

[[autodoc]] SegformerForSemanticSegmentation
    - forward
+
+## TFSegformerDecodeHead
+
+[[autodoc]] TFSegformerDecodeHead
+    - call
+
+## TFSegformerModel
+
+[[autodoc]] TFSegformerModel
+    - call
+
+## TFSegformerForImageClassification
+
+[[autodoc]] TFSegformerForImageClassification
+    - call
+
+## TFSegformerForSemanticSegmentation
+
+[[autodoc]] TFSegformerForSemanticSegmentation
+    - call
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 5d6227c04a43..a3af2dc71254 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -2430,6 +2430,16 @@
            "TFRoFormerPreTrainedModel",
        ]
    )
+    _import_structure["models.segformer"].extend(
+        [
+            "TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TFSegformerDecodeHead",
+            "TFSegformerForImageClassification",
+            "TFSegformerForSemanticSegmentation",
+            "TFSegformerModel",
+            "TFSegformerPreTrainedModel",
+        ]
+    )
    _import_structure["models.speech_to_text"].extend(
        [
            "TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -4789,6 +4799,14 @@
        TFRoFormerModel,
        TFRoFormerPreTrainedModel,
    )
+    from .models.segformer import (
+        TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+        TFSegformerDecodeHead,
+        TFSegformerForImageClassification,
+        TFSegformerForSemanticSegmentation,
+        TFSegformerModel,
+        TFSegformerPreTrainedModel,
+    )
    from .models.speech_to_text import (
        TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
        TFSpeech2TextForConditionalGeneration,
diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
index 2a437f548f14..6f9b15c131d6 100644
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -68,6 +68,7 @@
        ("resnet", "TFResNetModel"),
        ("roberta", "TFRobertaModel"),
        ("roformer", "TFRoFormerModel"),
+        ("segformer", "TFSegformerModel"),
        ("speech_to_text", "TFSpeech2TextModel"),
        ("swin", "TFSwinModel"),
        ("t5", "TFT5Model"),
@@ -180,6 +181,7 @@
        ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")),
        ("regnet", "TFRegNetForImageClassification"),
        ("resnet", "TFResNetForImageClassification"),
+        ("segformer", "TFSegformerForImageClassification"),
        ("swin", "TFSwinForImageClassification"),
        ("vit", "TFViTForImageClassification"),
    ]
@@ -189,6 +191,7 @@
    [
        # Model for Semantic Segmentation mapping
        ("data2vec-vision", "TFData2VecVisionForSemanticSegmentation"),
+        ("segformer", "TFSegformerForSemanticSegmentation"),
    ]
)
diff --git a/src/transformers/models/segformer/__init__.py b/src/transformers/models/segformer/__init__.py
index 1ce4ecb07a9c..2317237509a0 100644
--- a/src/transformers/models/segformer/__init__.py
+++ b/src/transformers/models/segformer/__init__.py
@@ -17,7 +17,13 @@
# limitations under the License.
from typing import TYPE_CHECKING

-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_tf_available,
+    is_torch_available,
+    is_vision_available,
+)


_import_structure = {"configuration_segformer": ["SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegformerConfig"]}
@@ -46,6 +52,21 @@
        "SegformerPreTrainedModel",
    ]

+try:
+    if not is_tf_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_tf_segformer"] = [
+        "TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "TFSegformerDecodeHead",
+        "TFSegformerForImageClassification",
+        "TFSegformerForSemanticSegmentation",
+        "TFSegformerModel",
+        "TFSegformerPreTrainedModel",
+    ]
+
if TYPE_CHECKING:
    from .configuration_segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig
@@ -73,7 +94,20 @@
            SegformerModel,
            SegformerPreTrainedModel,
        )
-
+    try:
+        if not is_tf_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_tf_segformer import (
+            TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TFSegformerDecodeHead,
+            TFSegformerForImageClassification,
+            TFSegformerForSemanticSegmentation,
+            TFSegformerModel,
+            TFSegformerPreTrainedModel,
+        )
else:
    import sys
diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py
index 22bf9234b2ae..9d0d6878a5cb 100755
--- a/src/transformers/models/segformer/modeling_segformer.py
+++ b/src/transformers/models/segformer/modeling_segformer.py
@@ -785,6 +785,8 @@ def forward(
        >>> inputs = feature_extractor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)
+        >>> list(logits.shape)
+        [1, 150, 128, 128]
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_hidden_states = (
@@ -804,7 +806,7 @@ def forward(

        loss = None
        if labels is not None:
-            if self.config.num_labels == 1:
+            if not self.config.num_labels > 1:
                raise ValueError("The number of labels should be greater than one")
            else:
                # upsample logits to the images' original size
diff --git a/src/transformers/models/segformer/modeling_tf_segformer.py b/src/transformers/models/segformer/modeling_tf_segformer.py
new file mode 100644
index 000000000000..e64a10178bf8
--- /dev/null
+++ b/src/transformers/models/segformer/modeling_tf_segformer.py
@@ -0,0 +1,878 @@
+# coding=utf-8
+# Copyright 2022 NVIDIA and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TensorFlow SegFormer model.""" + +import math +from typing import Dict, Optional, Tuple, Union + +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput +from ...modeling_tf_utils import TFPreTrainedModel, TFSequenceClassificationLoss, keras_serializable, unpack_inputs +from ...tf_utils import shape_list, stable_softmax +from ...utils import logging +from .configuration_segformer import SegformerConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "SegformerConfig" +_FEAT_EXTRACTOR_FOR_DOC = "SegformerFeatureExtractor" + +# Base docstring +_CHECKPOINT_FOR_DOC = "nvidia/mit-b0" +_EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + +TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "nvidia/segformer-b0-finetuned-ade-512-512", + # See all SegFormer models at https://huggingface.co/models?filter=segformer +] + + +# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->Segformer +class TFSegformerDropPath(tf.keras.layers.Layer): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + References: + (1) github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_path, **kwargs): + super().__init__(**kwargs) + self.drop_path = drop_path + + def call(self, x, training=None): + if training: + keep_prob = 1 - self.drop_path + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +class TFSegformerOverlapPatchEmbeddings(tf.keras.layers.Layer): + """Construct the overlapping patch embeddings.""" + + def __init__(self, patch_size, stride, hidden_size, **kwargs): + super().__init__(**kwargs) + self.padding = tf.keras.layers.ZeroPadding2D(padding=patch_size // 2) + self.proj = tf.keras.layers.Conv2D( + filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj" + ) + + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") + + def call(self, pixel_values: tf.Tensor) -> Tuple[tf.Tensor, int, int]: + embeddings = self.proj(self.padding(pixel_values)) + height = shape_list(embeddings)[1] + width = shape_list(embeddings)[2] + hidden_dim = shape_list(embeddings)[3] + # (batch_size, height, width, num_channels) -> (batch_size, height*width, num_channels) + # this can be fed to a Transformer layer + embeddings = tf.reshape(embeddings, (-1, height * width, hidden_dim)) + embeddings = self.layer_norm(embeddings) + return embeddings, height, width + + +class TFSegformerEfficientSelfAttention(tf.keras.layers.Layer): + """SegFormer's efficient self-attention mechanism. 
Employs the sequence reduction process introduced in the [PvT + paper](https://arxiv.org/abs/2102.12122).""" + + def __init__( + self, + config: SegformerConfig, + hidden_size: int, + num_attention_heads: int, + sequence_reduction_ratio: int, + **kwargs + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + + if self.hidden_size % self.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention " + f"heads ({self.num_attention_heads})" + ) + + self.attention_head_size = self.hidden_size // self.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = tf.keras.layers.Dense(self.all_head_size, name="query") + self.key = tf.keras.layers.Dense(self.all_head_size, name="key") + self.value = tf.keras.layers.Dense(self.all_head_size, name="value") + + self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + + self.sr_ratio = sequence_reduction_ratio + if sequence_reduction_ratio > 1: + self.sr = tf.keras.layers.Conv2D( + filters=hidden_size, kernel_size=sequence_reduction_ratio, strides=sequence_reduction_ratio, name="sr" + ) + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm") + + def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] + # to [batch_size, seq_length, num_attention_heads, attention_head_size] + batch_size = shape_list(tensor)[0] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] + # to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + height: int, + width: int, + output_attentions: bool = False, + training: bool = False, + ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: + batch_size = shape_list(hidden_states)[0] + num_channels = shape_list(hidden_states)[2] + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + if self.sr_ratio > 1: + # Reshape to (batch_size, height, width, num_channels) + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) + # Apply sequence reduction + hidden_states = self.sr(hidden_states) + # Reshape back to (batch_size, seq_len, num_channels) + hidden_states = tf.reshape(hidden_states, (batch_size, -1, num_channels)) + hidden_states = self.layer_norm(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + + scale = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, scale) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
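+        # (concretely: zeroing an attention weight removes that key/value token from the given query's context)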
+ attention_probs = self.dropout(attention_probs, training=training) + + context_layer = tf.matmul(attention_probs, value_layer) + + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size)) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + +class TFSegformerSelfOutput(tf.keras.layers.Layer): + def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(hidden_size, name="dense") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class TFSegformerAttention(tf.keras.layers.Layer): + def __init__( + self, + config: SegformerConfig, + hidden_size: int, + num_attention_heads: int, + sequence_reduction_ratio: int, + **kwargs + ): + super().__init__(**kwargs) + self.self = TFSegformerEfficientSelfAttention( + config=config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + name="self", + ) + self.dense_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output") + + def call( + self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False + ) -> Union[tf.Tensor, Tuple[tf.Tensor, tf.Tensor]]: + self_outputs = self.self(hidden_states, height, width, output_attentions) + + attention_output = self.dense_output(self_outputs[0]) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class TFSegformerDWConv(tf.keras.layers.Layer): + def __init__(self, dim: int = 768, **kwargs): + super().__init__(**kwargs) + self.depthwise_convolution = tf.keras.layers.Conv2D( + filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv" + ) + + def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: + batch_size = shape_list(hidden_states)[0] + num_channels = shape_list(hidden_states)[-1] + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) + hidden_states = self.depthwise_convolution(hidden_states) + + new_height = shape_list(hidden_states)[1] + new_width = shape_list(hidden_states)[2] + num_channels = shape_list(hidden_states)[3] + hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels)) + return hidden_states + + +class TFSegformerMixFFN(tf.keras.layers.Layer): + def __init__( + self, + config: SegformerConfig, + in_features: int, + hidden_features: int = None, + out_features: int = None, + **kwargs + ): + super().__init__(**kwargs) + out_features = out_features or in_features + self.dense1 = tf.keras.layers.Dense(hidden_features, name="dense1") + self.depthwise_convolution = TFSegformerDWConv(hidden_features, name="dwconv") + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.dense2 = tf.keras.layers.Dense(out_features, name="dense2") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor: + hidden_states = self.dense1(hidden_states) + hidden_states = 
self.depthwise_convolution(hidden_states, height, width) + hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense2(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class TFSegformerLayer(tf.keras.layers.Layer): + """This corresponds to the Block class in the original implementation.""" + + def __init__( + self, + config, + hidden_size: int, + num_attention_heads: int, + drop_path: float, + sequence_reduction_ratio: int, + mlp_ratio: int, + **kwargs + ): + super().__init__(**kwargs) + self.layer_norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1") + self.attention = TFSegformerAttention( + config, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + sequence_reduction_ratio=sequence_reduction_ratio, + name="attention", + ) + self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else tf.keras.layers.Activation("linear") + self.layer_norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2") + mlp_hidden_size = int(hidden_size * mlp_ratio) + self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp") + + def call( + self, + hidden_states: tf.Tensor, + height: int, + width: int, + output_attentions: bool = False, + training: bool = False, + ) -> Tuple: + self_attention_outputs = self.attention( + self.layer_norm_1(hidden_states), # in Segformer, layernorm is applied before self-attention + height, + width, + output_attentions=output_attentions, + training=training, + ) + + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection (with stochastic depth) + attention_output = self.drop_path(attention_output, training=training) + hidden_states = attention_output + hidden_states + mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width) + + # second residual connection (with stochastic depth) + mlp_output = self.drop_path(mlp_output, training=training) + layer_output = mlp_output + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class TFSegformerEncoder(tf.keras.layers.Layer): + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + + # stochastic depth decay rule + drop_path_decays = [x.numpy() for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))] + + # patch embeddings + embeddings = [] + for i in range(config.num_encoder_blocks): + embeddings.append( + TFSegformerOverlapPatchEmbeddings( + patch_size=config.patch_sizes[i], + stride=config.strides[i], + hidden_size=config.hidden_sizes[i], + name=f"patch_embeddings.{i}", + ) + ) + self.embeddings = embeddings + + # Transformer blocks + blocks = [] + cur = 0 + for i in range(config.num_encoder_blocks): + # each block consists of layers + layers = [] + if i != 0: + cur += config.depths[i - 1] + for j in range(config.depths[i]): + layers.append( + TFSegformerLayer( + config, + hidden_size=config.hidden_sizes[i], + num_attention_heads=config.num_attention_heads[i], + drop_path=drop_path_decays[cur + j], + sequence_reduction_ratio=config.sr_ratios[i], + mlp_ratio=config.mlp_ratios[i], + name=f"block.{i}.{j}", + ) + ) + blocks.append(layers) + + self.block = blocks + + # Layer norms + self.layer_norms = [ + tf.keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}") + for i in 
range(config.num_encoder_blocks) + ] + + def call( + self, + pixel_values: tf.Tensor, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + training: bool = False, + ) -> Union[Tuple, TFBaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + batch_size = shape_list(pixel_values)[0] + + hidden_states = pixel_values + for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)): + embedding_layer, block_layer, norm_layer = x + # first, obtain patch embeddings + hidden_states, height, width = embedding_layer(hidden_states) + + # second, send embeddings through blocks + # (each block consists of multiple layers i.e., list of layers) + for i, blk in enumerate(block_layer): + layer_outputs = blk( + hidden_states, + height, + width, + output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + # third, apply layer norm + hidden_states = norm_layer(hidden_states) + + # fourth, optionally reshape back to (batch_size, height, width, num_channels) + if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage): + num_channels = shape_list(hidden_states)[-1] + hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels)) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions + ) + + +@keras_serializable +class TFSegformerMainLayer(tf.keras.layers.Layer): + config_class = SegformerConfig + + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + # hierarchical Transformer encoder + self.encoder = TFSegformerEncoder(config, name="encoder") + + @unpack_inputs + def call( + self, + pixel_values: tf.Tensor, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[Tuple, TFBaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # When running on CPU, `tf.keras.layers.Conv2D` doesn't support `NCHW` format. + # So change the input format from `NCHW` to `NHWC`. 
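+        # e.g. a (batch_size, 3, 512, 512) pixel_values tensor becomes (batch_size, 512, 512, 3)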
+        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+        encoder_outputs = self.encoder(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = encoder_outputs[0]
+        # Change to NCHW output format to have uniformity in the modules
+        sequence_output = tf.transpose(sequence_output, perm=[0, 3, 1, 2])
+
+        # Change the other hidden state outputs to NCHW as well
+        if output_hidden_states:
+            hidden_states = tuple([tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1]])
+
+        if not return_dict:
+            if tf.greater(len(encoder_outputs[1:]), 0):
+                transposed_encoder_outputs = tuple(tf.transpose(v, perm=[0, 3, 1, 2]) for v in encoder_outputs[1:][0])
+                return (sequence_output,) + (transposed_encoder_outputs,)
+            else:
+                return (sequence_output,) + encoder_outputs[1:]
+
+        return TFBaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class TFSegformerPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = SegformerConfig
+    base_model_prefix = "segformer"
+    main_input_name = "pixel_values"
+
+    @property
+    def dummy_inputs(self) -> Dict[str, tf.Tensor]:
+        """
+        Dummy inputs to build the network.
+
+        Returns:
+            `Dict[str, tf.Tensor]`: The dummy inputs.
+        """
+        VISION_DUMMY_INPUTS = tf.random.uniform(shape=(3, self.config.num_channels, 512, 512), dtype=tf.float32)
+        return {"pixel_values": tf.constant(VISION_DUMMY_INPUTS)}
+
+    @tf.function(
+        input_signature=[
+            {
+                "pixel_values": tf.TensorSpec((None, None, None, None), tf.float32, name="pixel_values"),
+            }
+        ]
+    )
+    def serving(self, inputs):
+        """
+        Method used for serving the model.
+
+        Args:
+            inputs (`Dict[str, tf.Tensor]`):
+                The input of the saved model as a dictionary of tensors.
+        """
+        return self.call(inputs)
+
+
+SEGFORMER_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`SegformerConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+SEGFORMER_INPUTS_DOCSTRING = r"""
+
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]`, `Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]`, and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoFeatureExtractor`]. See
+            [`AutoFeatureExtractor.__call__`] for details.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
+    SEGFORMER_START_DOCSTRING,
+)
+class TFSegformerModel(TFSegformerPreTrainedModel):
+    def __init__(self, config: SegformerConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.config = config
+
+        # hierarchical Transformer encoder
+        self.segformer = TFSegformerMainLayer(config, name="segformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(
+        processor_class=_FEAT_EXTRACTOR_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def call(
+        self,
+        pixel_values: tf.Tensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[Tuple, TFBaseModelOutput]:
+        outputs = self.segformer(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+
+@add_start_docstrings(
+    """
+    SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden
+    states) e.g. for ImageNet.
+ """, + SEGFORMER_START_DOCSTRING, +) +class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: SegformerConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + self.segformer = TFSegformerMainLayer(config, name="segformer") + + # Classifier head + self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_FEAT_EXTRACTOR_FOR_DOC, + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def call( + self, + pixel_values: Optional[tf.Tensor] = None, + labels: Optional[tf.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TFSequenceClassifierOutput]: + outputs = self.segformer( + pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + # convert last hidden states to (batch_size, height*width, hidden_size) + batch_size = shape_list(sequence_output)[0] + sequence_output = tf.transpose(sequence_output, perm=[0, 2, 3, 1]) + sequence_output = tf.reshape(sequence_output, (batch_size, -1, self.config.hidden_sizes[-1])) + + # global average pooling + sequence_output = tf.reduce_mean(sequence_output, axis=1) + + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) + + +class TFSegformerMLP(tf.keras.layers.Layer): + """ + Linear Embedding. 
+ """ + + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(**kwargs) + self.proj = tf.keras.layers.Dense(config.decoder_hidden_size, name="proj") + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + height = shape_list(hidden_states)[1] + width = shape_list(hidden_states)[2] + hidden_dim = shape_list(hidden_states)[-1] + hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim)) + hidden_states = self.proj(hidden_states) + return hidden_states + + +class TFSegformerDecodeHead(TFSegformerPreTrainedModel): + def __init__(self, config: SegformerConfig, **kwargs): + super().__init__(config, **kwargs) + # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size + mlps = [] + for i in range(config.num_encoder_blocks): + mlp = TFSegformerMLP(config, name=f"linear_c.{i}") + mlps.append(mlp) + self.mlps = mlps + + # the following 3 layers implement the ConvModule of the original implementation + self.linear_fuse = tf.keras.layers.Conv2D( + filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse" + ) + self.batch_norm = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1, name="batch_norm") + self.activation = tf.keras.layers.Activation("relu") + + self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob) + self.classifier = tf.keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier") + + self.config = config + + def call(self, encoder_hidden_states): + batch_size = shape_list(encoder_hidden_states[-1])[0] + + all_hidden_states = () + for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps): + if self.config.reshape_last_stage is False and len(shape_list(encoder_hidden_state)) == 3: + height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[1], tf.float32)) + height = width = tf.cast(height, tf.int32) + encoder_hidden_state = tf.reshape(encoder_hidden_state, (batch_size, height, width, -1)) + + # unify channel dimension + encoder_hidden_state = tf.transpose(encoder_hidden_state, perm=[0, 2, 3, 1]) + height = shape_list(encoder_hidden_state)[1] + width = shape_list(encoder_hidden_state)[2] + encoder_hidden_state = mlp(encoder_hidden_state) + encoder_hidden_state = tf.reshape(encoder_hidden_state, (batch_size, height, width, -1)) + + # upsample + temp_state = tf.transpose(encoder_hidden_states[0], perm=[0, 2, 3, 1]) + upsample_resolution = shape_list(temp_state)[1:-1] + encoder_hidden_state = tf.image.resize(encoder_hidden_state, size=upsample_resolution, method="bilinear") + all_hidden_states += (encoder_hidden_state,) + + hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1)) + hidden_states = self.batch_norm(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.dropout(hidden_states) + + # logits of shape (batch_size, height/4, width/4, num_labels) + logits = self.classifier(hidden_states) + + return logits + + +@add_start_docstrings( + """SegFormer Model transformer with an all-MLP decode head on top e.g. 
for ADE20k, CityScapes.""",
+    SEGFORMER_START_DOCSTRING,
+)
+class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
+    def __init__(self, config: SegformerConfig, **kwargs):
+        super().__init__(config, **kwargs)
+        self.segformer = TFSegformerMainLayer(config, name="segformer")
+        self.decode_head = TFSegformerDecodeHead(config, name="decode_head")
+
+    def hf_compute_loss(self, logits, labels):
+        # upsample logits to the images' original size
+        # `labels` is of shape (batch_size, height, width)
+        label_interp_shape = shape_list(labels)[1:]
+
+        upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
+        # compute weighted loss
+        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
+
+        def masked_loss(real, pred):
+            unmasked_loss = loss_fct(real, pred)
+            mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype)
+            masked_loss = unmasked_loss * mask
+            # Reduction strategy in a similar spirit to
+            # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210
+            reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)
+            return tf.reshape(reduced_masked_loss, (1,))
+
+        return masked_loss(labels, upsampled_logits)
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: tf.Tensor,
+        labels: Optional[tf.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, TFSemanticSegmenterOutput]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels > 1`, a (per-pixel) classification loss is computed
+            (Cross-Entropy).
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import SegformerFeatureExtractor, TFSegformerForSemanticSegmentation
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+        >>> model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+
+        >>> inputs = feature_extractor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs, training=False)
+        >>> # logits are of shape (batch_size, num_labels, height/4, width/4)
+        >>> logits = outputs.logits
+        >>> list(logits.shape)
+        [1, 150, 128, 128]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        outputs = self.segformer(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=True,  # we need the intermediate hidden states
+            return_dict=return_dict,
+        )
+
+        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+        logits = self.decode_head(encoder_hidden_states)
+
+        loss = None
+        if labels is not None:
+            if not self.config.num_labels > 1:
+                raise ValueError("The number of labels should be greater than one")
+            else:
+                loss = self.hf_compute_loss(logits=logits, labels=labels)
+
+        # make logits of shape (batch_size, num_labels, height, width) to
+        # keep them consistent across APIs
+        logits = tf.transpose(logits, perm=[0, 3, 1, 2])
+
+        if not return_dict:
+            if output_hidden_states:
+                output = (logits,) + outputs[1:]
+            else:
+                output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSemanticSegmenterOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index 0b842fa359fa..37b58cd81466 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -1980,6 +1980,44 @@ def __init__(self, *args, **kwargs):
        requires_backends(self, ["tf"])


+TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class TFSegformerDecodeHead(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFSegformerForImageClassification(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFSegformerForSemanticSegmentation(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFSegformerModel(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFSegformerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
TF_SPEECH_TO_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py
index 9af59299f8ec..6a1d273f6642 100644
--- a/tests/models/segformer/test_modeling_segformer.py
+++
b/tests/models/segformer/test_modeling_segformer.py @@ -18,7 +18,7 @@ import inspect import unittest -from transformers import is_torch_available, is_vision_available +from transformers import SegformerConfig, is_torch_available, is_vision_available from transformers.models.auto import get_values from transformers.testing_utils import require_torch, slow, torch_device @@ -31,7 +31,6 @@ from transformers import ( MODEL_MAPPING, - SegformerConfig, SegformerForImageClassification, SegformerForSemanticSegmentation, SegformerModel, diff --git a/tests/models/segformer/test_modeling_tf_segformer.py b/tests/models/segformer/test_modeling_tf_segformer.py new file mode 100644 index 000000000000..fbf38fa1d6b2 --- /dev/null +++ b/tests/models/segformer/test_modeling_tf_segformer.py @@ -0,0 +1,540 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the TensorFlow SegFormer model. """ + +import inspect +import unittest +from typing import List, Tuple + +import numpy as np + +from transformers import SegformerConfig +from transformers.file_utils import is_tf_available, is_vision_available +from transformers.testing_utils import require_tf, slow + +from ...test_configuration_common import ConfigTester +from ...test_modeling_tf_common import TFModelTesterMixin, floats_tensor, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import TFSegformerForImageClassification, TFSegformerForSemanticSegmentation, TFSegformerModel + from transformers.models.segformer.modeling_tf_segformer import TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST + +if is_vision_available(): + from PIL import Image + + from transformers import SegformerFeatureExtractor + + +class TFSegformerConfigTester(ConfigTester): + def create_and_test_config_common_properties(self): + config = self.config_class(**self.inputs_dict) + self.parent.assertTrue(hasattr(config, "hidden_sizes")) + self.parent.assertTrue(hasattr(config, "num_attention_heads")) + self.parent.assertTrue(hasattr(config, "num_encoder_blocks")) + + +class TFSegformerModelTester: + def __init__( + self, + parent, + batch_size=13, + image_size=64, + num_channels=3, + num_encoder_blocks=4, + depths=[2, 2, 2, 2], + sr_ratios=[8, 4, 2, 1], + hidden_sizes=[16, 32, 64, 128], + downsampling_rates=[1, 4, 8, 16], + num_attention_heads=[1, 2, 4, 8], + is_training=True, + use_labels=True, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + num_labels=3, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.image_size = image_size + self.num_channels = num_channels + self.num_encoder_blocks = num_encoder_blocks + self.sr_ratios = sr_ratios + self.depths = depths + self.hidden_sizes = hidden_sizes + self.downsampling_rates = downsampling_rates + self.num_attention_heads = num_attention_heads + self.is_training = is_training + self.use_labels = use_labels + self.hidden_act = 
hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.num_labels = num_labels + self.scope = scope + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + + labels = None + if self.use_labels: + labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels) + + config = self.get_config() + return config, pixel_values, labels + + def get_config(self): + return SegformerConfig( + image_size=self.image_size, + num_channels=self.num_channels, + num_encoder_blocks=self.num_encoder_blocks, + depths=self.depths, + hidden_sizes=self.hidden_sizes, + num_attention_heads=self.num_attention_heads, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + initializer_range=self.initializer_range, + num_labels=self.num_labels, + ) + + def create_and_check_model(self, config, pixel_values, labels): + model = TFSegformerModel(config=config) + result = model(pixel_values, training=False) + expected_height = expected_width = self.image_size // (self.downsampling_rates[-1] * 2) + self.parent.assertEqual( + result.last_hidden_state.shape, (self.batch_size, self.hidden_sizes[-1], expected_height, expected_width) + ) + + def create_and_check_for_image_segmentation(self, config, pixel_values, labels): + config.num_labels = self.num_labels + model = TFSegformerForSemanticSegmentation(config) + result = model(pixel_values, training=False) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4) + ) + result = model(pixel_values, labels=labels, training=False) + self.parent.assertEqual( + result.logits.shape, (self.batch_size, self.num_labels, self.image_size // 4, self.image_size // 4) + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, labels = config_and_inputs + inputs_dict = {"pixel_values": pixel_values} + return config, inputs_dict + + def prepare_config_and_inputs_for_keras_fit(self, for_segmentation: bool = False): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values, seg_labels = config_and_inputs + if for_segmentation: + inputs_dict = {"pixel_values": pixel_values, "labels": seg_labels} + else: + inputs_dict = {"pixel_values": pixel_values, "labels": tf.zeros((self.batch_size))} + return config, inputs_dict + + +@require_tf +class TFSegformerModelTest(TFModelTesterMixin, unittest.TestCase): + all_model_classes = ( + (TFSegformerModel, TFSegformerForImageClassification, TFSegformerForSemanticSegmentation) + if is_tf_available() + else () + ) + + test_head_masking = False + test_onnx = False + test_pruning = False + test_resize_embeddings = False + + def setUp(self): + self.model_tester = TFSegformerModelTester(self) + self.config_tester = TFSegformerConfigTester(self, config_class=SegformerConfig, has_text_modality=False) + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip("SegFormer does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip("SegFormer does not have get_input_embeddings method and get_output_embeddings methods") + def 
test_model_common_attributes(self): + pass + + @unittest.skip("Test was written for TF 1.x and isn't really relevant here") + def test_compile_tf_model(self): + pass + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.call) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values"] + self.assertListEqual(arg_names[:1], expected_arg_names) + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + + expected_num_attentions = sum(self.model_tester.depths) + self.assertEqual(len(attentions), expected_num_attentions) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.attentions + + self.assertEqual(len(attentions), expected_num_attentions) + + # verify the first attentions (first block, first layer) + expected_seq_len = (self.model_tester.image_size // 4) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2 + self.assertListEqual( + list(attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len], + ) + + # verify the last attentions (last block, last layer) + expected_seq_len = (self.model_tester.image_size // 32) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (32 * self.model_tester.sr_ratios[-1])) ** 2 + self.assertListEqual( + list(attentions[-1].shape[-3:]), + [self.model_tester.num_attention_heads[-1], expected_seq_len, expected_reduced_seq_len], + ) + out_len = len(outputs) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 1, len(outputs)) + + self_attentions = outputs.attentions + + self.assertEqual(len(self_attentions), expected_num_attentions) + # verify the first attentions (first block, first layer) + expected_seq_len = (self.model_tester.image_size // 4) ** 2 + expected_reduced_seq_len = (self.model_tester.image_size // (4 * self.model_tester.sr_ratios[0])) ** 2 + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads[0], expected_seq_len, expected_reduced_seq_len], + ) + + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.hidden_states + + expected_num_layers = self.model_tester.num_encoder_blocks + self.assertEqual(len(hidden_states), expected_num_layers) + + # verify the first hidden states (first block) + self.assertListEqual( + 
list(hidden_states[0].shape[-3:]), + [ + self.model_tester.hidden_sizes[0], + self.model_tester.image_size // 4, + self.model_tester.image_size // 4, + ], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + def test_model_outputs_equivalence(self): + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}): + tuple_output = model(tuple_inputs, return_dict=False, **additional_kwargs) + dict_output = model(dict_inputs, return_dict=True, **additional_kwargs).to_tuple() + + def recursive_check(tuple_object, dict_object): + if isinstance(tuple_object, (List, Tuple)): + for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object): + recursive_check(tuple_iterable_value, dict_iterable_value) + elif tuple_object is None: + return + else: + self.assertTrue( + all(tf.equal(tuple_object, dict_object)), + msg=( + "Tuple and dict output are not equal. Difference:" + f" {tf.math.reduce_max(tf.abs(tuple_object - dict_object))}" + ), + ) + + recursive_check(tuple_output, dict_output) + + for model_class in self.all_model_classes: + model = model_class(config) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs) + + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) + + if self.has_attentions: + tuple_inputs = self._prepare_for_class(inputs_dict, model_class) + dict_inputs = self._prepare_for_class(inputs_dict, model_class) + check_equivalence(model, tuple_inputs, dict_inputs, {"output_attentions": True}) + + # todo: incorporate label support for semantic segmentation in `test_modeling_tf_common.py`. + + def test_dataset_conversion(self): + gpus = tf.config.list_physical_devices("GPU") + # Grouped convs aren't supported on CPUs for backprop. 
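+        # Hence, only attempt the conversion test when at least one GPU is visible.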
+ if len(gpus) >= 1: + super().test_dataset_conversion() + + def test_keras_fit(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + gpus = tf.config.list_physical_devices("GPU") + + def apply(model): + if getattr(model, "hf_compute_loss", None): + model_weights = model.get_weights() + + # Test that model correctly compute the loss with kwargs + for_segmentation = True if model_class.__name__ == "TFSegformerForSemanticSegmentation" else False + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit( + for_segmentation=for_segmentation + ) + + label_names = {"labels"} + self.assertGreater(len(label_names), 0, msg="No matching label names found!") + labels = {key: val for key, val in prepared_for_class.items() if key in label_names} + inputs_minus_labels = {key: val for key, val in prepared_for_class.items() if key not in label_names} + self.assertGreater(len(inputs_minus_labels), 0) + model.compile(optimizer=tf.keras.optimizers.SGD(0.0), run_eagerly=True) + + # Make sure the model fits without crashing regardless of where we pass the labels + history1 = model.fit( + prepared_for_class, + validation_data=prepared_for_class, + steps_per_epoch=1, + validation_steps=1, + shuffle=False, + ) + val_loss1 = history1.history["val_loss"][0] + + # We reinitialize the model here even though our learning rate was zero + # because BatchNorm updates weights by means other than gradient descent. + model.set_weights(model_weights) + history2 = model.fit( + inputs_minus_labels, + labels, + validation_data=(inputs_minus_labels, labels), + steps_per_epoch=1, + validation_steps=1, + shuffle=False, + ) + val_loss2 = history2.history["val_loss"][0] + self.assertTrue(np.allclose(val_loss1, val_loss2, atol=1e-2, rtol=1e-3)) + + for model_class in self.all_model_classes: + # Since `TFSegformerModel` cannot operate with the default `fit()` method. + if model_class.__name__ != "TFSegformerModel": + # Grouped convs and backprop with them isn't supported on CPUs. + model = model_class(config) + if len(gpus) > 1: + apply(model) + + def test_loss_computation(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + def apply(model): + for_segmentation = True if model_class.__name__ == "TFSegformerForSemanticSegmentation" else False + # The number of elements in the loss should be the same as the number of elements in the label + _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit( + for_segmentation=for_segmentation + ) + added_label = prepared_for_class[ + sorted(list(prepared_for_class.keys() - inputs_dict.keys()), reverse=True)[0] + ] + loss_size = tf.size(added_label) + + # Test that model correctly compute the loss with kwargs + possible_input_names = {"input_ids", "pixel_values", "input_features"} + input_name = possible_input_names.intersection(set(prepared_for_class)).pop() + model_input = prepared_for_class.pop(input_name) + + loss = model(model_input, **prepared_for_class)[0] + + if model_class.__name__ == "TFSegformerForSemanticSegmentation": + # Semantic segmentation loss is computed similarly as + # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210. 
+                self.assertEqual(loss.shape, (1,))
+            else:
+                self.assertEqual(loss.shape, [loss_size])
+
+            # Test that the model correctly computes the loss with a dict
+            _, prepared_for_class = self.model_tester.prepare_config_and_inputs_for_keras_fit(
+                for_segmentation=for_segmentation
+            )
+            loss = model(**prepared_for_class)[0]
+
+            if model_class.__name__ == "TFSegformerForSemanticSegmentation":
+                self.assertEqual(loss.shape, (1,))
+            else:
+                self.assertEqual(loss.shape, [loss_size])
+
+            # Test that the model correctly computes the loss with a tuple
+            label_keys = prepared_for_class.keys() - inputs_dict.keys()
+            signature = inspect.signature(model.call).parameters
+            signature_names = list(signature.keys())
+
+            # Create a dictionary holding the location of the tensors in the tuple
+            tuple_index_mapping = {0: input_name}
+            for label_key in label_keys:
+                label_key_index = signature_names.index(label_key)
+                tuple_index_mapping[label_key_index] = label_key
+            sorted_tuple_index_mapping = sorted(tuple_index_mapping.items())
+            # Initialize a list with the signature defaults, fill in the prepared values, and convert to a tuple
+            list_input = []
+
+            for name in signature_names:
+                if name != "kwargs":
+                    list_input.append(signature[name].default)
+
+            for index, value in sorted_tuple_index_mapping:
+                list_input[index] = prepared_for_class[value]
+
+            tuple_input = tuple(list_input)
+
+            # Send the tuple to the model
+            loss = model(tuple_input[:-1])[0]
+            if model_class.__name__ == "TFSegformerForSemanticSegmentation":
+                self.assertEqual(loss.shape, (1,))
+            else:
+                self.assertEqual(loss.shape, [loss_size])
+
+        for model_class in self.all_model_classes:
+            # `TFSegformerModel` takes no labels, so there is no loss to compute; skip it.
+            if model_class.__name__ != "TFSegformerModel":
+                model = model_class(config)
+                apply(model)
+
+    def check_pt_tf_outputs(self, tf_outputs, pt_outputs, model_class, tol=2e-4, name="outputs", attributes=None):
+        # We override with a slightly higher tol value, as semseg models tend to diverge a bit more
+        super().check_pt_tf_outputs(tf_outputs, pt_outputs, model_class, tol, name, attributes)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in TF_SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = TFSegformerModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+@require_tf
+class TFSegformerModelIntegrationTest(unittest.TestCase):
+    @slow
+    def test_inference_image_segmentation_ade(self):
+        # only resize + normalize
+        feature_extractor = SegformerFeatureExtractor(
+            image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
+        )
+        model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+
+        image = prepare_img()
+        encoded_inputs = feature_extractor(images=image, return_tensors="tf")
+        pixel_values = encoded_inputs.pixel_values
+
+        outputs = model(pixel_values, training=False)
+
+        expected_shape = tf.TensorShape((1, model.config.num_labels, 128, 128))
+        self.assertEqual(outputs.logits.shape, expected_shape)
+
+        expected_slice = tf.constant(
+            [
+                [[-4.6310, -5.5232, -6.2356], [-5.1921, -6.1444, -6.5996], [-5.4424, -6.2790, -6.7574]],
+                [[-12.1391, -13.3122, -13.9554], [-12.8732, -13.9352, -14.3563], [-12.9438, -13.8226, -14.2513]],
+                [[-12.5134, -13.4686, -14.4915], [-12.8669, -14.4343, -14.7758], [-13.2523, -14.5819, -15.0694]],
+            ]
+        )
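+        # Compare the top-left 3x3 patch of the first three label channels against the reference slice.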
+        tf.debugging.assert_near(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-4)
+
+    @slow
+    def test_inference_image_segmentation_city(self):
+        # only resize + normalize
+        feature_extractor = SegformerFeatureExtractor(
+            image_scale=(512, 512), keep_ratio=False, align=False, do_random_crop=False
+        )
+        model = TFSegformerForSemanticSegmentation.from_pretrained(
+            "nvidia/segformer-b1-finetuned-cityscapes-1024-1024"
+        )
+
+        image = prepare_img()
+        encoded_inputs = feature_extractor(images=image, return_tensors="tf")
+        pixel_values = encoded_inputs.pixel_values
+
+        outputs = model(pixel_values, training=False)
+
+        expected_shape = tf.TensorShape((1, model.config.num_labels, 128, 128))
+        self.assertEqual(outputs.logits.shape, expected_shape)
+
+        expected_slice = tf.constant(
+            [
+                [[-13.5748, -13.9111, -12.6500], [-14.3500, -15.3683, -14.2328], [-14.7532, -16.0424, -15.6087]],
+                [[-17.1651, -15.8725, -12.9653], [-17.2580, -17.3718, -14.8223], [-16.6058, -16.8783, -16.7452]],
+                [[-3.6456, -3.0209, -1.4203], [-3.0797, -3.1959, -2.0000], [-1.8757, -1.9217, -1.6997]],
+            ]
+        )
+        tf.debugging.assert_near(outputs.logits[0, :3, :3, :3], expected_slice, atol=1e-1)
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 9905bb00544b..90d4218bdd6e 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -98,6 +98,7 @@
    "FlaxBartForCausalLM",  # Building part of bigger (tested) model.
    "FlaxBertForCausalLM",  # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM.
    "OPTDecoderWrapper",
+    "TFSegformerDecodeHead",  # Not a regular model.
]

# Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't
@@ -137,6 +138,7 @@
    "PerceiverForMultimodalAutoencoding",
    "PerceiverForOpticalFlow",
    "SegformerDecodeHead",
+    "TFSegformerDecodeHead",
    "FlaxBeitForMaskedImageModeling",
    "PLBartEncoder",
    "PLBartDecoder",
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index d7aa04216abf..0d1216e5b828 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -64,6 +64,7 @@ src/transformers/models/sew_d/modeling_sew_d.py
src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
src/transformers/models/speech_to_text/modeling_speech_to_text.py
src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
+src/transformers/models/segformer/modeling_tf_segformer.py
src/transformers/models/swin/modeling_swin.py
src/transformers/models/trocr/modeling_trocr.py
src/transformers/models/unispeech/modeling_unispeech.py